refactor: Replace pexpect with libtmux in BashSession (#4881)

Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Robert Brennan <accounts@rbren.io>
2025-12-26 05:48:36 +08:00 · 2025-01-03 16:22:13 -05:00 · 2025-01-03 16:22:13 -05:00 · ec70af9412
commit ec70af9412
parent 761a574b09
66 changed files with 2345 additions and 760 deletions
--- a/.github/workflows/dummy-agent-test.yml
+++ b/.github/workflows/dummy-agent-test.yml
@ -36,6 +36,8 @@ jobs:
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
--- a/.github/workflows/eval-runner.yml
+++ b/.github/workflows/eval-runner.yml
@ -29,6 +29,8 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
      - name: Install poetry via pipx
        run: pipx install poetry

--- a/.github/workflows/py-unit-tests-mac.yml
+++ b/.github/workflows/py-unit-tests-mac.yml
@ -31,6 +31,8 @@ jobs:
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: |
            ${{ runner.os }}-poetry-
+      - name: Install tmux
+        run: brew install tmux
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Install Python dependencies using Poetry
--- a/.github/workflows/py-unit-tests.yml
+++ b/.github/workflows/py-unit-tests.yml
@ -30,6 +30,8 @@ jobs:
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@v3
+      - name: Install tmux
+        run: sudo apt-get update && sudo apt-get install -y tmux
      - name: Install poetry via pipx
        run: pipx install poetry
      - name: Set up Python
--- a/docs/static/img/backend_architecture.puml
+++ b/docs/static/img/backend_architecture.puml
@ -123,7 +123,6 @@ class openhands.state.State {
  updated_info: List[Tuple[Action, Observation]]
 }
 class openhands.observation.CmdOutputObservation {
-  command_id: int
  command: str
  exit_code: int
  observation: str
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@ -137,7 +137,6 @@ def complete_runtime(

        action = CmdRunAction(
            command=f'chmod +x ./{script_name} && ./{script_name}',
-            keep_prompt=False,
        )
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
@ -164,8 +163,7 @@ def complete_runtime(
            logger.info(f'Running get ground truth cmd: {script_name}')

            action = CmdRunAction(
-                command=f'chmod +x ./{script_name} && ./{script_name}',
-                keep_prompt=False,
+                command=f'chmod +x ./{script_name} && ./{script_name}'
            )
            logger.info(action, extra={'msg_type': 'ACTION'})
            obs = runtime.run_action(action)
--- a/evaluation/benchmarks/aider_bench/run_infer.py
+++ b/evaluation/benchmarks/aider_bench/run_infer.py
@ -145,10 +145,7 @@ def complete_runtime(
        )
        logger.info(f'Running test file: {script_name}')

-    action = CmdRunAction(
-        command=f'python3 -m unittest {script_name}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'python3 -m unittest {script_name}')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
--- a/evaluation/benchmarks/biocoder/run_infer.py
+++ b/evaluation/benchmarks/biocoder/run_infer.py
@ -199,7 +199,7 @@ def complete_runtime(
    if obs.exit_code == 0:
        test_result['metadata']['1_copy_change_success'] = True

-        action = CmdRunAction(command=f'cat {generated_path}', keep_prompt=False)
+        action = CmdRunAction(command=f'cat {generated_path}')
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        assert obs.exit_code == 0
@ -223,9 +223,7 @@ def complete_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

-    action = CmdRunAction(
-        command='cat /testing_files/results_biocoder.json', keep_prompt=False
-    )
+    action = CmdRunAction(command='cat /testing_files/results_biocoder.json')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    if obs.exit_code == 0:
--- a/evaluation/benchmarks/bird/README.md
+++ b/evaluation/benchmarks/bird/README.md
@ -127,7 +127,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
        "observation": "run",
        "content": "california_schools/california_schools.sqlite\r\n[(1.0,)]",
        "extras": {
-          "command_id": -1,
          "command": "python3 0.py",
          "exit_code": 0
        }
--- a/evaluation/benchmarks/bird/run_infer.py
+++ b/evaluation/benchmarks/bird/run_infer.py
@ -268,10 +268,7 @@ def initialize_runtime(
    runtime.copy_to(db_file, '/workspace')

    # Check the database is copied
-    action = CmdRunAction(
-        command='cd /workspace && ls -l',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command='cd /workspace && ls -l')
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
@ -300,10 +297,7 @@ def complete_runtime(
    instance_id = instance.instance_id.replace('/', '__')
    path = os.path.join('/workspace', f'{instance_id}.py')

-    action = CmdRunAction(
-        command=f'cat {path}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'cat {path}')
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

--- a/evaluation/benchmarks/humanevalfix/README.md
+++ b/evaluation/benchmarks/humanevalfix/README.md
@ -71,7 +71,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                "observation": "run",
                "content": "[File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2:    return number % 1.0 + 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10:    assert truncate_number(3.5) == 0.5\r\n11:    assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12:    assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)",
                "extras": {
-                    "command_id": -1,
                    "command": "open Python__2.py",
                    "exit_code": 0
                }
@ -98,7 +97,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                "observation": "run",
                "content": "> > [File: /workspace/Python__2.py (14 lines total)]\r\n1:def truncate_number(number: float) -> float:\r\n2:    return number % 1.0\r\n3:\r\n4:\r\n5:\r\n6:\r\n7:\r\n8:\r\n9:def check(truncate_number):\r\n10:    assert truncate_number(3.5) == 0.5\r\n11:    assert abs(truncate_number(1.33) - 0.33) < 1e-6\r\n12:    assert abs(truncate_number(123.456) - 0.456) < 1e-6\r\n13:\r\n14:check(truncate_number)\r\nFile updated. Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.",
                "extras": {
-                    "command_id": -1,
                    "command": "edit 2:2 <<EOF\n    return number % 1.0\nEOF",
                    "exit_code": 0
                }
@ -125,7 +123,6 @@ For each problem, OpenHands is given a set number of iterations to fix the faili
                "observation": "run",
                "content": "",
                "extras": {
-                    "command_id": -1,
                    "command": "python3 Python__2.py",
                    "exit_code": 0
                }
--- a/evaluation/benchmarks/humanevalfix/run_infer.py
+++ b/evaluation/benchmarks/humanevalfix/run_infer.py
@ -171,9 +171,7 @@ def complete_runtime(
    num_workers = LANGUAGE_TO_NUM_WORKERS[language]
    python_imports = '\n'.join(IMPORT_HELPER[language])

-    action = CmdRunAction(
-        command=f'cat /workspace/{_get_instance_id(instance)}.py', keep_prompt=False
-    )
+    action = CmdRunAction(command=f'cat /workspace/{_get_instance_id(instance)}.py')
    obs = runtime.run_action(action)
    assert obs.exit_code == 0

--- a/evaluation/benchmarks/ml_bench/run_infer.py
+++ b/evaluation/benchmarks/ml_bench/run_infer.py
@ -163,7 +163,7 @@ def complete_runtime(
    eval_script = os.path.join(task_path, 'run.sh')
    logger.info(f'Running evaluation script: {eval_script}')

-    action = CmdRunAction(command=f'cat {eval_script}', keep_prompt=False)
+    action = CmdRunAction(command=f'cat {eval_script}')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    if obs.exit_code == 0:
--- a/evaluation/benchmarks/scienceagentbench/run_infer.py
+++ b/evaluation/benchmarks/scienceagentbench/run_infer.py
@ -121,10 +121,7 @@ def initialize_runtime(
    runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True)

    # Check the dataset exists
-    action = CmdRunAction(
-        command='cd /workspace/benchmark/datasets && ls',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command='cd /workspace/benchmark/datasets && ls')
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0
@ -154,10 +151,7 @@ def complete_runtime(

    assert obs.exit_code == 0

-    action = CmdRunAction(
-        command=f'cat pred_programs/{instance.pred_program_name}',
-        keep_prompt=False,
-    )
+    action = CmdRunAction(command=f'cat pred_programs/{instance.pred_program_name}')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)

--- a/evaluation/benchmarks/swe_bench/eval_infer.py
+++ b/evaluation/benchmarks/swe_bench/eval_infer.py
@ -98,6 +98,7 @@ def process_instance(
    metadata: EvalMetadata,
    reset_logger: bool = True,
    log_dir: str | None = None,
+    runtime_failure_count: int = 0,
 ) -> EvalOutput:
    """
    Evaluate agent performance on a SWE-bench problem instance.
@ -146,6 +147,16 @@ def process_instance(
            metadata=metadata,
        )

+    # Double the resource factor for each previous runtime failure (capped below)
+    if runtime_failure_count > 0:
+        config.sandbox.remote_runtime_resource_factor = min(
+            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
+            4,  # hardcode maximum resource factor to 4
+        )
+        logger.warning(
+            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
+        )
+
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    # Get patch and save it to /tmp/patch.diff
@ -177,7 +188,7 @@ def process_instance(
        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "echo 'APPLY_PATCH_FAIL')))"
    )
-    action = CmdRunAction(command=exec_command, keep_prompt=False)
+    action = CmdRunAction(command=exec_command)
    action.timeout = 600
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation)
@ -200,9 +211,7 @@ def process_instance(

            # Run eval script in background and save output to log file
            log_file = '/tmp/eval_output.log'
-            action = CmdRunAction(
-                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
-            )
+            action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
            action.timeout = 60  # Short timeout just to get the process ID
            obs = runtime.run_action(action)

@ -224,7 +233,7 @@ def process_instance(
                        instance['test_result']['report']['test_timeout'] = True
                        break
                    check_action = CmdRunAction(
-                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
+                        command=f'ps -p {pid} > /dev/null; echo $?'
                    )
                    check_action.timeout = 60
                    check_obs = runtime.run_action(check_action)
@ -242,7 +251,7 @@ def process_instance(
                    time.sleep(30)  # Wait for 30 seconds before checking again

                # Read the log file
-                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
+                cat_action = CmdRunAction(command=f'cat {log_file}')
                cat_action.timeout = 300
                cat_obs = runtime.run_action(cat_action)

--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@ -282,6 +282,16 @@ def initialize_runtime(
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

+    action = CmdRunAction(command='which python')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0 and 'testbed' in obs.content,
+        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+    )
+
    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)
@ -337,8 +347,7 @@ def complete_runtime(
    git_patch = None
    while n_retries < 5:
        action = CmdRunAction(
-            command=f'git diff --no-color --cached {instance["base_commit"]}',
-            keep_prompt=False,
+            command=f'git diff --no-color --cached {instance["base_commit"]}'
        )
        action.timeout = 600 + 100 * n_retries
        logger.info(action, extra={'msg_type': 'ACTION'})
@ -385,7 +394,7 @@ def process_instance(
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
-            2,  # hardcode maximum resource factor to 2
+            8,
        )
        logger.warning(
            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
@ -535,4 +544,5 @@ if __name__ == '__main__':
        args.eval_num_workers,
        process_instance,
        timeout_seconds=120 * 60,  # 2 hour PER instance should be more than enough
+        max_retries=5,
    )
--- a/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/compare_outputs.py
@ -104,9 +104,9 @@ for repo, diff in repo_diffs:
    # Determine if this repo has a significant diff
    is_significant = diff >= threshold
    repo_color = 'red' if is_significant else 'yellow'
-    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))

    print(f"\n{colored(repo, repo_color, attrs=['bold'])}:")
+    print(colored(f'Difference: {diff} instances!', repo_color, attrs=['bold']))
    print(colored(f'X resolved but Y failed: ({len(x_instances)} instances)', 'green'))
    if x_instances:
        print('  ' + str(x_instances))
--- a/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/convert_oh_output_to_md.py
@ -20,6 +20,13 @@ output_md_folder = args.oh_output_file.replace('.jsonl', '.viz')
 print(f'Converting {args.oh_output_file} to markdown files in {output_md_folder}')

 oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
+
+swebench_eval_file = args.oh_output_file.replace('.jsonl', '.swebench_eval.jsonl')
+if os.path.exists(swebench_eval_file):
+    eval_output_df = pd.read_json(swebench_eval_file, orient='records', lines=True)
+else:
+    eval_output_df = None
+
 # model name is the folder name of oh_output_file
 model_name = os.path.basename(os.path.dirname(args.oh_output_file))

@ -50,7 +57,7 @@ def convert_history_to_str(history):
    return ret


-def write_row_to_md_file(row):
+def write_row_to_md_file(row, instance_id_to_test_result):
    if 'git_patch' in row:
        model_patch = row['git_patch']
    elif 'test_result' in row and 'git_patch' in row['test_result']:
@ -58,8 +65,21 @@ def write_row_to_md_file(row):
    else:
        raise ValueError(f'Row {row} does not have a git_patch')

-    if 'report' in row:
-        resolved = row['report'].get('resolved', False)
+    test_output = None
+    if row['instance_id'] in instance_id_to_test_result:
+        report = instance_id_to_test_result[row['instance_id']].get('report', {})
+        resolved = report.get('resolved', False)
+        test_output = instance_id_to_test_result[row['instance_id']].get(
+            'test_output', None
+        )
+    elif 'report' in row and row['report'] is not None:
+        if not isinstance(row['report'], dict):
+            resolved = None
+            print(
+                f'ERROR: Report is not a dict, but a {type(row["report"])}. Row: {row}'
+            )
+        else:
+            resolved = row['report'].get('resolved', False)
    else:
        resolved = None

@ -84,5 +104,18 @@ def write_row_to_md_file(row):
        f.write('## Model Patch\n')
        f.write(f'{process_git_patch(model_patch)}\n')

+        f.write('## Test Output\n')
+        f.write(str(test_output))

-oh_format.progress_apply(write_row_to_md_file, axis=1)
+
+instance_id_to_test_result = {}
+if eval_output_df is not None:
+    instance_id_to_test_result = (
+        eval_output_df[['instance_id', 'test_result']]
+        .set_index('instance_id')['test_result']
+        .to_dict()
+    )
+
+oh_format.progress_apply(
+    write_row_to_md_file, axis=1, instance_id_to_test_result=instance_id_to_test_result
+)
--- a/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py
@ -111,6 +111,17 @@ elif os.path.exists(openhands_remote_report_jsonl):
        instance_id_to_status[row['instance_id']] = row['test_result']['report']
    df['report'] = df.apply(apply_report, axis=1)

+    # Every report must be a dict: the resolved-count below calls .get() on it.
+    report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
+    if not report_is_dict.all():
+        print(df[~report_is_dict])
+        # Name the types of the offending values themselves. `row` is left over
+        # from the code above and is not one of the rows that failed this check.
+        bad_types = sorted(
+            {type(report).__name__ for report in df.loc[~report_is_dict, 'report']}
+        )
+        raise ValueError(f'Report is not a dict, but a {", ".join(bad_types)}')
+
    _n_instances = len(df)
    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
    _n_unresolved = _n_instances - _n_resolved
--- a/evaluation/integration_tests/tests/t01_fix_simple_typo.py
+++ b/evaluation/integration_tests/tests/t01_fix_simple_typo.py
@ -24,7 +24,7 @@ class Test(BaseIntegrationTest):
    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        # check if the file /workspace/bad.txt has been fixed
-        action = CmdRunAction(command='cat /workspace/bad.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/bad.txt')
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            return TestResult(
--- a/evaluation/integration_tests/tests/t02_add_bash_hello.py
+++ b/evaluation/integration_tests/tests/t02_add_bash_hello.py
@ -10,14 +10,14 @@ class Test(BaseIntegrationTest):

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        # check if the file /workspace/hello.sh exists
-        action = CmdRunAction(command='cat /workspace/hello.sh', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/hello.sh')
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            return TestResult(
@ -26,7 +26,7 @@ class Test(BaseIntegrationTest):
            )

        # execute the script
-        action = CmdRunAction(command='bash /workspace/hello.sh', keep_prompt=False)
+        action = CmdRunAction(command='bash /workspace/hello.sh')
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            return TestResult(
--- a/evaluation/integration_tests/tests/t03_jupyter_write_file.py
+++ b/evaluation/integration_tests/tests/t03_jupyter_write_file.py
@ -10,14 +10,14 @@ class Test(BaseIntegrationTest):

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        # check if the file /workspace/hello.sh exists
-        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/test.txt')
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            return TestResult(
@ -26,7 +26,7 @@ class Test(BaseIntegrationTest):
            )

        # execute the script
-        action = CmdRunAction(command='cat /workspace/test.txt', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/test.txt')
        obs = runtime.run_action(action)

        if obs.exit_code != 0:
--- a/evaluation/integration_tests/tests/t04_git_staging.py
+++ b/evaluation/integration_tests/tests/t04_git_staging.py
@ -10,31 +10,29 @@ class Test(BaseIntegrationTest):

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

        # git init
-        action = CmdRunAction(command='git init', keep_prompt=False)
+        action = CmdRunAction(command='git init')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

        # create README.md
-        action = CmdRunAction(
-            command='echo \'print("hello world")\' > hello.py', keep_prompt=False
-        )
+        action = CmdRunAction(command='echo \'print("hello world")\' > hello.py')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

        # git add README.md
-        action = CmdRunAction(command='git add hello.py', keep_prompt=False)
+        action = CmdRunAction(command='git add hello.py')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

    @classmethod
    def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
        # check if the file /workspace/hello.py exists
-        action = CmdRunAction(command='cat /workspace/hello.py', keep_prompt=False)
+        action = CmdRunAction(command='cat /workspace/hello.py')
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            return TestResult(
@ -43,7 +41,7 @@ class Test(BaseIntegrationTest):
            )

        # check if the staging area is empty
-        action = CmdRunAction(command='git status', keep_prompt=False)
+        action = CmdRunAction(command='git status')
        obs = runtime.run_action(action)
        if obs.exit_code != 0:
            return TestResult(
--- a/evaluation/integration_tests/tests/t05_simple_browsing.py
+++ b/evaluation/integration_tests/tests/t05_simple_browsing.py
@ -83,11 +83,11 @@ class Test(BaseIntegrationTest):

    @classmethod
    def initialize_runtime(cls, runtime: Runtime) -> None:
-        action = CmdRunAction(command='mkdir -p /workspace', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /workspace')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

-        action = CmdRunAction(command='mkdir -p /tmp/server', keep_prompt=False)
+        action = CmdRunAction(command='mkdir -p /tmp/server')
        obs = runtime.run_action(action)
        assert_and_raise(obs.exit_code == 0, f'Failed to run command: {obs.content}')

@ -101,8 +101,7 @@ class Test(BaseIntegrationTest):

        # create README.md
        action = CmdRunAction(
-            command='cd /tmp/server && nohup python3 -m http.server 8000 &',
-            keep_prompt=False,
+            command='cd /tmp/server && nohup python3 -m http.server 8000 &'
        )
        obs = runtime.run_action(action)

--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@ -16,6 +16,7 @@ import {
  StatusMessage,
 } from "#/types/message";
 import { handleObservationMessage } from "./observations";
+import { appendInput } from "#/state/command-slice";

 const messageActions = {
  [ActionType.BROWSE]: (message: ActionMessage) => {
@ -62,6 +63,10 @@ export function handleActionMessage(message: ActionMessage) {
    return;
  }

+  if (message.action === ActionType.RUN) {
+    store.dispatch(appendInput(message.args.command));
+  }
+
  if ("args" in message && "security_risk" in message.args) {
    store.dispatch(appendSecurityAnalyzerInput(message));
  }
--- a/frontend/src/services/observations.ts
+++ b/frontend/src/services/observations.ts
@ -80,8 +80,7 @@ export function handleObservationMessage(message: ObservationMessage) {
            observation: "run" as const,
            extras: {
              command: String(message.extras.command || ""),
-              command_id: Number(message.extras.command_id || 0),
-              exit_code: Number(message.extras.exit_code || 0),
+              metadata: message.extras.metadata,
              hidden: Boolean(message.extras.hidden),
            },
          }),
--- a/frontend/src/state/chat-slice.ts
+++ b/frontend/src/state/chat-slice.ts
@ -93,7 +93,7 @@ export const chatSlice = createSlice({
      const translationID = `ACTION_MESSAGE$${actionID.toUpperCase()}`;
      let text = "";
      if (actionID === "run") {
-        text = `\`${action.payload.args.command}\``;
+        text = `Command:\n\`${action.payload.args.command}\``;
      } else if (actionID === "run_ipython") {
        text = `\`\`\`\n${action.payload.args.code}\n\`\`\``;
      } else if (actionID === "write") {
@ -144,7 +144,7 @@ export const chatSlice = createSlice({
      // Set success property based on observation type
      if (observationID === "run") {
        const commandObs = observation.payload as CommandObservation;
-        causeMessage.success = commandObs.extras.exit_code === 0;
+        causeMessage.success = commandObs.extras.metadata.exit_code === 0;
      } else if (observationID === "run_ipython") {
        // For IPython, we consider it successful if there's no error message
        const ipythonObs = observation.payload as IPythonObservation;
@ -158,7 +158,9 @@ export const chatSlice = createSlice({
        if (content.length > MAX_CONTENT_LENGTH) {
          content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
        }
-        content = `\`\`\`\n${content}\n\`\`\``;
+        content = `${
+          causeMessage.content
+        }\n\nOutput:\n\`\`\`\n${content.trim() || "[Command finished execution with no output]"}\n\`\`\``;
        causeMessage.content = content; // Observation content includes the action
      } else if (observationID === "read" || observationID === "edit") {
        const { content } = observation.payload;
--- a/frontend/src/types/core/observations.ts
+++ b/frontend/src/types/core/observations.ts
@ -13,9 +13,8 @@ export interface CommandObservation extends OpenHandsObservationEvent<"run"> {
  source: "agent";
  extras: {
    command: string;
-    command_id: number;
-    exit_code: number;
    hidden?: boolean;
+    metadata: Record<string, unknown>;
  };
 }

--- a/frontend/src/types/message.tsx
+++ b/frontend/src/types/message.tsx
@ -27,8 +27,11 @@ export interface ObservationMessage {
  // The observed data
  content: string;

-  // Additional structured data
-  extras: Record<string, string>;
+  extras: {
+    metadata: Record<string, unknown>;
+    error_id: string;
+    [key: string]: string | Record<string, unknown>;
+  };

  // A friendly message that can be put in the chat log
  message: string;
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@ -277,7 +277,9 @@ class CodeActAgent(Agent):
                )
            else:
                text = truncate_content(
-                    obs.content + obs.interpreter_details, max_message_chars
+                    obs.content
+                    + f'\n[Python Interpreter: {obs.metadata.py_interpreter_path}]',
+                    max_message_chars,
                )
            text += f'\n[Command finished with exit code {obs.exit_code}]'
            message = Message(role='user', content=[TextContent(text=text)])
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@ -31,8 +31,7 @@ from openhands.events.tool import ToolCallMetadata

 _BASH_DESCRIPTION = """Execute a bash command in the terminal.
 * Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
-* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.
-* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.
+* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command like `C-c` (Ctrl+C) to interrupt the process.
 """

 CmdRunTool = ChatCompletionToolParam(
@ -45,7 +44,7 @@ CmdRunTool = ChatCompletionToolParam(
            'properties': {
                'command': {
                    'type': 'string',
-                    'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.',
+                    'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process.',
                },
            },
            'required': ['command'],
--- a/openhands/agenthub/dummy_agent/agent.py
+++ b/openhands/agenthub/dummy_agent/agent.py
@ -18,6 +18,7 @@ from openhands.events.action import (
 from openhands.events.observation import (
    AgentStateChangedObservation,
    BrowserOutputObservation,
+    CmdOutputMetadata,
    CmdOutputObservation,
    FileReadObservation,
    FileWriteObservation,
@ -54,11 +55,7 @@ class DummyAgent(Agent):
            },
            {
                'action': CmdRunAction(command='echo "foo"'),
-                'observations': [
-                    CmdOutputObservation(
-                        'foo', command_id=-1, command='echo "foo"', exit_code=0
-                    )
-                ],
+                'observations': [CmdOutputObservation('foo', command='echo "foo"')],
            },
            {
                'action': FileWriteAction(
@ -81,9 +78,8 @@ class DummyAgent(Agent):
                'observations': [
                    CmdOutputObservation(
                        'bash: hello.sh: No such file or directory',
-                        command_id=-1,
                        command='bash workspace/hello.sh',
-                        exit_code=127,
+                        metadata=CmdOutputMetadata(exit_code=127),
                    )
                ],
            },
@ -152,8 +148,6 @@ class DummyAgent(Agent):
                        obs.pop('timestamp', None)
                        obs.pop('cause', None)
                        obs.pop('source', None)
-                        if 'extras' in obs:
-                            obs['extras'].pop('command_id', None)

                    if hist_obs != expected_obs:
                        print(
--- a/openhands/controller/stuck.py
+++ b/openhands/controller/stuck.py
@ -5,7 +5,7 @@ from openhands.events.action.commands import IPythonRunCellAction
 from openhands.events.action.empty import NullAction
 from openhands.events.action.message import MessageAction
 from openhands.events.event import Event, EventSource
-from openhands.events.observation.commands import (
+from openhands.events.observation import (
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
--- a/openhands/core/utils/json.py
+++ b/openhands/core/utils/json.py
@ -6,6 +6,7 @@ from litellm.types.utils import ModelResponse

 from openhands.core.exceptions import LLMResponseError
 from openhands.events.event import Event
+from openhands.events.observation import CmdOutputMetadata
 from openhands.events.serialization import event_to_dict
 from openhands.llm.metrics import Metrics

@ -20,6 +21,8 @@ def my_default_encoder(obj):
        return obj.get()
    if isinstance(obj, ModelResponse):
        return obj.model_dump()
+    if isinstance(obj, CmdOutputMetadata):
+        return obj.model_dump()
    return json.JSONEncoder().default(obj)


--- a/openhands/events/action/commands.py
+++ b/openhands/events/action/commands.py
@ -12,19 +12,11 @@ from openhands.events.action.action import (
 @dataclass
 class CmdRunAction(Action):
    command: str
+    # When `command` is empty, it will be used to print the current tmux window
    thought: str = ''
    blocking: bool = False
-    # If False, the command will be run in a non-blocking / interactive way
-    # The partial command outputs will be returned as output observation.
-    # If True, the command will be run for max .timeout seconds.
-    keep_prompt: bool = True
-    # if True, the command prompt will be kept in the command output observation
-    # Example of command output:
-    # root@sandbox:~# ls
-    # file1.txt
-    # file2.txt
-    # root@sandbox:~# <-- this is the command prompt
-
+    # If blocking is True, the command will be run in a blocking manner.
+    # e.g., it will NOT return early due to soft timeout.
    hidden: bool = False
    action: str = ActionType.RUN
    runnable: ClassVar[bool] = True
--- a/openhands/events/action/files.py
+++ b/openhands/events/action/files.py
@ -48,6 +48,15 @@ class FileWriteAction(Action):
    def message(self) -> str:
        return f'Writing file: {self.path}'

+    def __repr__(self) -> str:
+        return (
+            f'**FileWriteAction**\n'
+            f'Path: {self.path}\n'
+            f'Range: [L{self.start}:L{self.end}]\n'
+            f'Thought: {self.thought}\n'
+            f'Content:\n```\n{self.content}\n```\n'
+        )
+

 @dataclass
 class FileEditAction(Action):
--- a/openhands/events/observation/__init__.py
+++ b/openhands/events/observation/__init__.py
@ -1,6 +1,7 @@
 from openhands.events.observation.agent import AgentStateChangedObservation
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
@ -20,6 +21,7 @@ __all__ = [
    'Observation',
    'NullObservation',
    'CmdOutputObservation',
+    'CmdOutputMetadata',
    'IPythonRunCellObservation',
    'BrowserOutputObservation',
    'FileReadObservation',
--- a/openhands/events/observation/commands.py
+++ b/openhands/events/observation/commands.py
@ -1,19 +1,136 @@
-from dataclasses import dataclass
+import json
+import re
+import traceback
+from dataclasses import dataclass, field
+from typing import Self

+from pydantic import BaseModel
+
+from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema import ObservationType
 from openhands.events.observation.observation import Observation

+CMD_OUTPUT_PS1_BEGIN = '\n###PS1JSON###\n'
+CMD_OUTPUT_PS1_END = '\n###PS1END###'
+CMD_OUTPUT_METADATA_PS1_REGEX = re.compile(
+    f'^{CMD_OUTPUT_PS1_BEGIN.strip()}(.*?){CMD_OUTPUT_PS1_END.strip()}',
+    re.DOTALL | re.MULTILINE,
+)
+
+
+class CmdOutputMetadata(BaseModel):
+    """Additional metadata captured from PS1"""
+
+    exit_code: int = -1
+    pid: int = -1
+    username: str | None = None
+    hostname: str | None = None
+    working_dir: str | None = None
+    py_interpreter_path: str | None = None
+    prefix: str = ''  # Prefix to add to command output
+    suffix: str = ''  # Suffix to add to command output
+
+    @classmethod
+    def to_ps1_prompt(cls) -> str:
+        """Convert the required metadata into a PS1 prompt."""
+        prompt = CMD_OUTPUT_PS1_BEGIN
+        json_str = json.dumps(
+            {
+                'pid': '$!',
+                'exit_code': '$?',
+                'username': r'\u',
+                'hostname': r'\h',
+                'working_dir': r'$(pwd)',
+                'py_interpreter_path': r'$(which python 2>/dev/null || echo "")',
+            },
+            indent=2,
+        )
+        # Make sure we escape double quotes in the JSON string
+        # So that PS1 will keep them as part of the output
+        prompt += json_str.replace('"', r'\"')
+        prompt += CMD_OUTPUT_PS1_END + '\n'  # Ensure there's a newline at the end
+        return prompt
+
+    @classmethod
+    def matches_ps1_metadata(cls, string: str) -> list[re.Match[str]]:
+        matches = []
+        for match in CMD_OUTPUT_METADATA_PS1_REGEX.finditer(string):
+            try:
+                json.loads(match.group(1).strip())  # Try to parse as JSON
+                matches.append(match)
+            except json.JSONDecodeError:
+                logger.warning(
+                    f'Failed to parse PS1 metadata: {match.group(1)}. Skipping.'
+                    + traceback.format_exc()
+                )
+                continue  # Skip if not valid JSON
+        return matches
+
+    @classmethod
+    def from_ps1_match(cls, match: re.Match[str]) -> Self:
+        """Extract the required metadata from a PS1 prompt."""
+        metadata = json.loads(match.group(1))
+        # Create a copy of metadata to avoid modifying the original
+        processed = metadata.copy()
+        # Convert numeric fields
+        if 'pid' in metadata:
+            try:
+                processed['pid'] = int(float(str(metadata['pid'])))
+            except (ValueError, TypeError):
+                processed['pid'] = -1
+        if 'exit_code' in metadata:
+            try:
+                processed['exit_code'] = int(float(str(metadata['exit_code'])))
+            except (ValueError, TypeError):
+                logger.warning(
+                    f'Failed to parse exit code: {metadata["exit_code"]}. Setting to -1.'
+                )
+                processed['exit_code'] = -1
+        return cls(**processed)
+

 @dataclass
 class CmdOutputObservation(Observation):
    """This data class represents the output of a command."""

-    command_id: int
    command: str
-    exit_code: int = 0
-    hidden: bool = False
    observation: str = ObservationType.RUN
-    interpreter_details: str = ''
+    # Additional metadata captured from PS1
+    metadata: CmdOutputMetadata = field(default_factory=CmdOutputMetadata)
+    # Whether the command output should be hidden from the user
+    hidden: bool = False
+
+    def __init__(
+        self,
+        content: str,
+        command: str,
+        observation: str = ObservationType.RUN,
+        metadata: dict | CmdOutputMetadata | None = None,
+        hidden: bool = False,
+        **kwargs,
+    ):
+        super().__init__(content)
+        self.command = command
+        self.observation = observation
+        self.hidden = hidden
+        if isinstance(metadata, dict):
+            self.metadata = CmdOutputMetadata(**metadata)
+        else:
+            self.metadata = metadata or CmdOutputMetadata()
+
+        # Handle legacy attribute
+        if 'exit_code' in kwargs:
+            self.metadata.exit_code = kwargs['exit_code']
+        if 'command_id' in kwargs:
+            self.metadata.pid = kwargs['command_id']
+
+    @property
+    def command_id(self) -> int:
+        return self.metadata.pid
+
+    @property
+    def exit_code(self) -> int:
+        return self.metadata.exit_code

    @property
    def error(self) -> bool:
@ -28,7 +145,21 @@ class CmdOutputObservation(Observation):
        return not self.error

    def __str__(self) -> str:
-        return f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code})**\n{self.content}'
+        return (
+            f'**CmdOutputObservation (source={self.source}, exit code={self.exit_code}, '
+            f'metadata={json.dumps(self.metadata.model_dump(), indent=2)})**\n'
+            '--BEGIN AGENT OBSERVATION--\n'
+            f'{self._to_agent_observation()}\n'
+            '--END AGENT OBSERVATION--'
+        )
+
+    def _to_agent_observation(self) -> str:
+        ret = f'{self.metadata.prefix}{self.content}{self.metadata.suffix}'
+        if self.metadata.working_dir:
+            ret += f'\n[Current working directory: {self.metadata.working_dir}]'
+        if self.metadata.py_interpreter_path:
+            ret += f'\n[Python interpreter: {self.metadata.py_interpreter_path}]'
+        return ret


 @dataclass
--- a/openhands/events/observation/files.py
+++ b/openhands/events/observation/files.py
@ -18,6 +18,9 @@ class FileReadObservation(Observation):
    def message(self) -> str:
        return f'I read the file {self.path}.'

+    def __str__(self) -> str:
+        return f'[Read from {self.path} is successful.]\n' f'{self.content}'
+

 @dataclass
 class FileWriteObservation(Observation):
@ -30,6 +33,9 @@ class FileWriteObservation(Observation):
    def message(self) -> str:
        return f'I wrote to the file {self.path}.'

+    def __str__(self) -> str:
+        return f'[Write to {self.path} is successful.]\n' f'{self.content}'
+

 @dataclass
 class FileEditObservation(Observation):
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@ -67,6 +67,10 @@ def action_from_dict(action: dict) -> Action:
    if 'images_urls' in args:
        args['image_urls'] = args.pop('images_urls')

+    # keep_prompt has been deprecated in https://github.com/All-Hands-AI/OpenHands/pull/4881
+    if 'keep_prompt' in args:
+        args.pop('keep_prompt')
+
    try:
        decoded_action = action_class(**args)
        if 'timeout' in action:
--- a/openhands/events/serialization/event.py
+++ b/openhands/events/serialization/event.py
@ -1,6 +1,8 @@
 from dataclasses import asdict
 from datetime import datetime

+from pydantic import BaseModel
+
 from openhands.events import Event, EventSource
 from openhands.events.observation.observation import Observation
 from openhands.events.serialization.action import action_from_dict
@ -56,6 +58,12 @@ def event_from_dict(data) -> 'Event':
    return evt


+def _convert_pydantic_to_dict(obj: BaseModel | dict) -> dict:
+    if isinstance(obj, BaseModel):
+        return obj.model_dump()
+    return obj
+
+
 def event_to_dict(event: 'Event') -> dict:
    props = asdict(event)
    d = {}
@ -82,7 +90,11 @@ def event_to_dict(event: 'Event') -> dict:
            d['timeout'] = event.timeout
    elif 'observation' in d:
        d['content'] = props.pop('content', '')
-        d['extras'] = props
+
+        # props is a dict whose values can include a complex object like an instance of a BaseModel subclass
+        # such as CmdOutputMetadata
+        # we serialize it along with the rest
+        d['extras'] = {k: _convert_pydantic_to_dict(v) for k, v in props.items()}
        # Include success field for CmdOutputObservation
        if hasattr(event, 'success'):
            d['success'] = event.success
@ -109,7 +121,6 @@ def event_to_memory(event: 'Event', max_message_chars: int) -> dict:
    # runnable actions have some extra fields used in the BE/FE, which should not be sent to the LLM
    if 'args' in d:
        d['args'].pop('blocking', None)
-        d['args'].pop('keep_prompt', None)
        d['args'].pop('confirmation_state', None)

    if 'extras' in d:
--- a/openhands/events/serialization/observation.py
+++ b/openhands/events/serialization/observation.py
@ -1,6 +1,9 @@
+import copy
+
 from openhands.events.observation.agent import AgentStateChangedObservation
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
@ -37,6 +40,26 @@ OBSERVATION_TYPE_TO_CLASS = {
 }


+def _update_cmd_output_metadata(
+    metadata: dict | CmdOutputMetadata | None, **kwargs
+) -> dict | CmdOutputMetadata:
+    """Update the metadata of a CmdOutputObservation.
+
+    If metadata is None, create a new CmdOutputMetadata instance.
+    If metadata is a dict, update the dict.
+    If metadata is a CmdOutputMetadata instance, update the instance.
+    """
+    if metadata is None:
+        return CmdOutputMetadata(**kwargs)
+
+    if isinstance(metadata, dict):
+        metadata.update(**kwargs)
+    elif isinstance(metadata, CmdOutputMetadata):
+        for key, value in kwargs.items():
+            setattr(metadata, key, value)
+    return metadata
+
+
 def observation_from_dict(observation: dict) -> Observation:
    observation = observation.copy()
    if 'observation' not in observation:
@ -49,6 +72,24 @@ def observation_from_dict(observation: dict) -> Observation:
    observation.pop('observation')
    observation.pop('message', None)
    content = observation.pop('content', '')
-    extras = observation.pop('extras', {})
+    extras = copy.deepcopy(observation.pop('extras', {}))
+
+    # Handle legacy attributes for CmdOutputObservation
+    if 'exit_code' in extras:
+        extras['metadata'] = _update_cmd_output_metadata(
+            extras.get('metadata', None), exit_code=extras.pop('exit_code')
+        )
+    if 'command_id' in extras:
+        extras['metadata'] = _update_cmd_output_metadata(
+            extras.get('metadata', None), pid=extras.pop('command_id')
+        )
+    # convert metadata to CmdOutputMetadata if it is a dict
+    if observation_class is CmdOutputObservation:
+        if 'metadata' in extras and isinstance(extras['metadata'], dict):
+            extras['metadata'] = CmdOutputMetadata(**extras['metadata'])
+        elif 'metadata' in extras and isinstance(extras['metadata'], CmdOutputMetadata):
+            pass
+        else:
+            extras['metadata'] = CmdOutputMetadata()

    return observation_class(content=content, **extras)
--- a/openhands/events/utils.py
+++ b/openhands/events/utils.py
@ -2,9 +2,11 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.events.action.action import Action
 from openhands.events.action.empty import NullAction
 from openhands.events.event import Event
-from openhands.events.observation.commands import CmdOutputObservation
-from openhands.events.observation.empty import NullObservation
-from openhands.events.observation.observation import Observation
+from openhands.events.observation import (
+    CmdOutputObservation,
+    NullObservation,
+    Observation,
+)


 def get_pairs_from_events(events: list[Event]) -> list[tuple[Action, Observation]]:
--- a/openhands/resolver/resolve_issue.py
+++ b/openhands/resolver/resolve_issue.py
@ -122,10 +122,7 @@ async def complete_runtime(
    n_retries = 0
    git_patch = None
    while n_retries < 5:
-        action = CmdRunAction(
-            command=f'git diff --no-color --cached {base_commit}',
-            keep_prompt=False,
-        )
+        action = CmdRunAction(command=f'git diff --no-color --cached {base_commit}')
        action.timeout = 600 + 100 * n_retries
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@ -95,31 +95,34 @@ class ActionExecutor:
        browsergym_eval_env: str | None,
    ) -> None:
        self.plugins_to_load = plugins_to_load
-        self._initial_pwd = work_dir
+        self._initial_cwd = work_dir
        self.username = username
        self.user_id = user_id
        _updated_user_id = init_user_and_working_directory(
-            username=username, user_id=self.user_id, initial_pwd=work_dir
+            username=username, user_id=self.user_id, initial_cwd=work_dir
        )
        if _updated_user_id is not None:
            self.user_id = _updated_user_id

-        self.bash_session = BashSession(
-            work_dir=work_dir,
-            username=username,
-        )
-
+        self.bash_session: BashSession | None = None
        self.lock = asyncio.Lock()
        self.plugins: dict[str, Plugin] = {}
        self.browser = BrowserEnv(browsergym_eval_env)
        self.start_time = time.time()
        self.last_execution_time = self.start_time
+        self._initialized = False

    @property
-    def initial_pwd(self):
-        return self._initial_pwd
+    def initial_cwd(self):
+        return self._initial_cwd

    async def ainit(self):
+        # bash needs to be initialized first
+        self.bash_session = BashSession(
+            work_dir=self._initial_cwd,
+            username=self.username,
+        )
+        self.bash_session.initialize()
        await wait_all(
            (self._init_plugin(plugin) for plugin in self.plugins_to_load),
            timeout=30,
@ -138,8 +141,14 @@ class ActionExecutor:

        await self._init_bash_commands()
        logger.debug('Runtime client initialized.')
+        self._initialized = True
+
+    @property
+    def initialized(self) -> bool:
+        return self._initialized

    async def _init_plugin(self, plugin: Plugin):
+        assert self.bash_session is not None
        await plugin.initialize(self.username)
        self.plugins[plugin.name] = plugin
        logger.debug(f'Initializing plugin: {plugin.name}')
@ -147,7 +156,7 @@ class ActionExecutor:
        if isinstance(plugin, JupyterPlugin):
            await self.run_ipython(
                IPythonRunCellAction(
-                    code=f'import os; os.chdir("{self.bash_session.pwd}")'
+                    code=f'import os; os.chdir("{self.bash_session.cwd}")'
                )
            )

@ -177,30 +186,32 @@ class ActionExecutor:
    async def run(
        self, action: CmdRunAction
    ) -> CmdOutputObservation | ErrorObservation:
-        obs = await call_sync_from_async(self.bash_session.run, action)
+        assert self.bash_session is not None
+        obs = await call_sync_from_async(self.bash_session.execute, action)
        return obs

    async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
+        assert self.bash_session is not None
        if 'jupyter' in self.plugins:
            _jupyter_plugin: JupyterPlugin = self.plugins['jupyter']  # type: ignore
            # This is used to make AgentSkills in Jupyter aware of the
            # current working directory in Bash
-            jupyter_pwd = getattr(self, '_jupyter_pwd', None)
-            if self.bash_session.pwd != jupyter_pwd:
+            jupyter_cwd = getattr(self, '_jupyter_cwd', None)
+            if self.bash_session.cwd != jupyter_cwd:
                logger.debug(
-                    f'{self.bash_session.pwd} != {jupyter_pwd} -> reset Jupyter PWD'
+                    f'{self.bash_session.cwd} != {jupyter_cwd} -> reset Jupyter PWD'
                )
-                reset_jupyter_pwd_code = (
-                    f'import os; os.chdir("{self.bash_session.pwd}")'
+                reset_jupyter_cwd_code = (
+                    f'import os; os.chdir("{self.bash_session.cwd}")'
                )
-                _aux_action = IPythonRunCellAction(code=reset_jupyter_pwd_code)
+                _aux_action = IPythonRunCellAction(code=reset_jupyter_cwd_code)
                _reset_obs: IPythonRunCellObservation = await _jupyter_plugin.run(
                    _aux_action
                )
                logger.debug(
-                    f'Changed working directory in IPython to: {self.bash_session.pwd}. Output: {_reset_obs}'
+                    f'Changed working directory in IPython to: {self.bash_session.cwd}. Output: {_reset_obs}'
                )
-                self._jupyter_pwd = self.bash_session.pwd
+                self._jupyter_cwd = self.bash_session.cwd

            obs: IPythonRunCellObservation = await _jupyter_plugin.run(action)
            obs.content = obs.content.rstrip()
@ -266,7 +277,7 @@ class ActionExecutor:

            if action.include_extra:
                obs.content += (
-                    f'\n[Jupyter current working directory: {self.bash_session.pwd}]'
+                    f'\n[Jupyter current working directory: {self.bash_session.cwd}]'
                )
                obs.content += f'\n[Jupyter Python interpreter: {_jupyter_plugin.python_interpreter_path}]'
            return obs
@ -282,6 +293,7 @@ class ActionExecutor:
        return str(filepath)

    async def read(self, action: FileReadAction) -> Observation:
+        assert self.bash_session is not None
        if action.impl_source == FileReadSource.OH_ACI:
            return await self.run_ipython(
                IPythonRunCellAction(
@ -292,7 +304,7 @@ class ActionExecutor:

        # NOTE: the client code is running inside the sandbox,
        # so there's no need to check permission
-        working_dir = self.bash_session.workdir
+        working_dir = self.bash_session.cwd
        filepath = self._resolve_path(action.path, working_dir)
        try:
            if filepath.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
@ -339,7 +351,8 @@ class ActionExecutor:
        return FileReadObservation(path=filepath, content=code_view)

    async def write(self, action: FileWriteAction) -> Observation:
-        working_dir = self.bash_session.workdir
+        assert self.bash_session is not None
+        working_dir = self.bash_session.cwd
        filepath = self._resolve_path(action.path, working_dir)

        insert = action.content.split('\n')
@ -400,7 +413,8 @@ class ActionExecutor:
        return await browse(action, self.browser)

    def close(self):
-        self.bash_session.close()
+        if self.bash_session is not None:
+            self.bash_session.close()
        self.browser.close()


@ -609,6 +623,8 @@ if __name__ == '__main__':

    @app.get('/alive')
    async def alive():
+        if client is None or not client.initialized:
+            return {'status': 'not initialized'}
        return {'status': 'ok'}

    # ================================
@ -658,11 +674,11 @@ if __name__ == '__main__':

        # Get the full path of the requested directory
        if path is None:
-            full_path = client.initial_pwd
+            full_path = client.initial_cwd
        elif os.path.isabs(path):
            full_path = path
        else:
-            full_path = os.path.join(client.initial_pwd, path)
+            full_path = os.path.join(client.initial_cwd, path)

        if not os.path.exists(full_path):
            # if user just removed a folder, prevent server error 500 in UI
--- a/openhands/runtime/impl/action_execution/action_execution_client.py
+++ b/openhands/runtime/impl/action_execution/action_execution_client.py
@ -264,7 +264,6 @@ class ActionExecutionClient(Runtime):
                raise AgentRuntimeTimeoutError(
                    f'Runtime failed to return execute_action before the requested timeout of {action.timeout}s'
                )
-
            return obs

    def run(self, action: CmdRunAction) -> Observation:
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@ -219,7 +219,9 @@ class RemoteRuntime(ActionExecutionClient):
            'image': self.container_image,
            'command': command,
            'working_dir': '/openhands/code/',
-            'environment': {'DEBUG': 'true'} if self.config.debug else {},
+            'environment': {'DEBUG': 'true'}
+            if self.config.debug or os.environ.get('DEBUG', 'false').lower() == 'true'
+            else {},
            'session_id': self.sid,
            'resource_factor': self.config.sandbox.remote_runtime_resource_factor,
        }
@ -364,7 +366,10 @@ class RemoteRuntime(ActionExecutionClient):
        try:
            return super()._send_action_server_request(method, url, **kwargs)
        except requests.Timeout:
-            self.log('error', 'No response received within the timeout period.')
+            self.log(
+                'error',
+                f'No response received within the timeout period for url: {url}',
+            )
            raise
        except requests.HTTPError as e:
            if e.response.status_code == 404:
--- a/openhands/runtime/utils/bash.py
+++ b/openhands/runtime/utils/bash.py
@ -1,18 +1,21 @@
 import os
 import re
+import time
+import uuid
+from enum import Enum

 import bashlex
-import pexpect
+import libtmux

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
-from openhands.events.event import EventSource
-from openhands.events.observation import (
+from openhands.events.observation import ErrorObservation
+from openhands.events.observation.commands import (
+    CMD_OUTPUT_PS1_END,
+    CmdOutputMetadata,
    CmdOutputObservation,
-    ErrorObservation,
 )
-
-SOFT_TIMEOUT_SECONDS = 5
+from openhands.utils.shutdown_listener import should_continue


 def split_bash_commands(commands):
@ -66,269 +69,493 @@ def split_bash_commands(commands):
    return result


+def escape_bash_special_chars(command: str) -> str:
+    r"""Double the backslash in front of `;`, `&`, `|`, `>` and `<` in unquoted text.
+
+    The command is parsed with bashlex and rebuilt piece by piece. Words wrapped
+    in double quotes, single quotes, `$(...)` or backticks are copied through
+    unchanged, and so are heredoc redirects. Every other word, and the text that
+    precedes each word, has sequences such as \; or \| rewritten to \\; and \\|.
+    Text after the last parsed node is appended as-is.
+
+    Args:
+        command: The bash command line to process.
+
+    Returns:
+        The rebuilt command; '' when the command is blank; the command
+        unchanged (with a warning logged) when bashlex cannot parse it.
+    """
+    if not command.strip():
+        return ''
+
+    def _escape(text: str) -> str:
+        # Turns `\;` into `\\;` (likewise for & | > <)
+        return re.sub(r'\\([;&|><])', r'\\\\\1', text)
+
+    # (opening, closing) delimiters of words that must be preserved verbatim
+    verbatim_delimiters = (('"', '"'), ("'", "'"), ('$(', ')'), ('`', '`'))
+
+    def _is_verbatim(word: str) -> bool:
+        return any(
+            word.startswith(opening) and word.endswith(closing)
+            for opening, closing in verbatim_delimiters
+        )
+
+    try:
+        pieces = []
+        cursor = 0  # index in `command` up to which text has been emitted
+
+        def walk(node):
+            nonlocal cursor
+            if node.kind == 'redirect' and getattr(node, 'heredoc', None) is not None:
+                # Heredoc: keep the gap, the start marker and the heredoc content as-is
+                pieces.extend(
+                    [
+                        command[cursor : node.pos[0]],
+                        command[node.pos[0] : node.heredoc.pos[0]],
+                        command[node.heredoc.pos[0] : node.heredoc.pos[1]],
+                    ]
+                )
+                cursor = node.pos[1]
+                return
+
+            if node.kind == 'word':
+                word = command[node.pos[0] : node.pos[1]]
+                # Text between the previous position and this word is always escaped
+                pieces.append(_escape(command[cursor : node.pos[0]]))
+                # Quoted strings and command substitutions are preserved untouched
+                pieces.append(word if _is_verbatim(word) else _escape(word))
+                cursor = node.pos[1]
+                return
+
+            # Any other node: descend into its children, if it has any
+            for child in getattr(node, 'parts', ()):
+                walk(child)
+
+        # Process all top-level nodes of the AST
+        for top_node in list(bashlex.parse(command)):
+            pieces.append(_escape(command[cursor : top_node.pos[0]]))
+            cursor = top_node.pos[0]
+            walk(top_node)
+
+        # Whatever follows the last visited node is appended unmodified
+        pieces.append(command[cursor:])
+        return ''.join(pieces)
+    except bashlex.errors.ParsingError:
+        # Fallback if parsing fails
+        logger.warning(f'Failed to parse command: {command}')
+        return command
+
+
+class BashCommandStatus(Enum):
+    """Status BashSession records in `prev_status` for the most recent command."""
+
+    # Accepted by BashSession.execute() as a state an empty command may continue from.
+    CONTINUE = 'continue'
+    # Set by _handle_completed_command once the pane ends with the PS1 end marker.
+    COMPLETED = 'completed'
+    # Set when the pane output did not change for NO_CHANGE_TIMEOUT_SECONDS.
+    NO_CHANGE_TIMEOUT = 'no_change_timeout'
+    # Set when the action's own `timeout` elapsed before the command completed.
+    HARD_TIMEOUT = 'hard_timeout'
+
+
+def _remove_command_prefix(command_output: str, command: str) -> str:
+    """Drop the echoed command from the start of captured output.
+
+    Leading whitespace is ignored on both arguments before comparing, and any
+    whitespace left at the start of the result is stripped as well. When the
+    output does not start with the command, only the whitespace is stripped.
+    """
+    trimmed_output = command_output.lstrip()
+    echoed_command = command.lstrip()
+    if trimmed_output.startswith(echoed_command):
+        trimmed_output = trimmed_output[len(echoed_command) :]
+    return trimmed_output.lstrip()
+
+
 class BashSession:
-    """A class that maintains a pexpect process and provides a simple interface for running commands and interacting with the shell."""
+    """Run shell commands inside a tmux pane (via libtmux) and report their output.
+
+    Call initialize() before execute(); close() kills the tmux session.
+    """
+
+    # Seconds execute() sleeps between two captures of the pane content.
+    POLL_INTERVAL = 0.5
+    # Value applied to the tmux `history-limit` option in initialize().
+    HISTORY_LIMIT = 10_000
+    # Prompt string exported as PS1 through PROMPT_COMMAND in initialize().
+    PS1 = CmdOutputMetadata.to_ps1_prompt()

-    def __init__(self, work_dir: str, username: str):
-        self._pwd = work_dir
+    def __init__(
+        self,
+        work_dir: str,
+        username: str | None = None,
+        no_change_timeout_seconds: float = 30.0,
+    ):
+        """Store the session settings; no tmux session is started here.
+
+        Args:
+            work_dir: Directory the shell is started in by initialize().
+            username: When set, the shell is started through `su` for this user.
+            no_change_timeout_seconds: Seconds without pane output changes after
+                which a non-blocking command is reported as timed out.
+        """
+        # Flipped to True at the end of initialize(); execute() checks it.
+        self._initialized = False
+        self.username = username
+        self.work_dir = work_dir
+        self.NO_CHANGE_TIMEOUT_SECONDS = no_change_timeout_seconds

-        self.shell = pexpect.spawn(
-            f'su {username}',
-            encoding='utf-8',
-            codec_errors='replace',
-            echo=False,
+    def initialize(self):
+        """Start the tmux session that backs this BashSession.
+
+        Creates a libtmux server handle and a uniquely named session, raises the
+        tmux history limit, replaces the initial window with a fresh one,
+        installs the metadata PS1 prompt and records the starting working
+        directory. `_initialized` is set to True as the last step; execute()
+        raises RuntimeError until then.
+        """
+        self.server = libtmux.Server()
+        window_command = '/bin/bash'
+        if self.username:
+            # This starts a non-login (new) shell for the given user
+            window_command = f'su {self.username} -'
+
+        # The uuid4 suffix makes the session name unique per BashSession
+        session_name = f'openhands-{self.username}-{uuid.uuid4()}'
+        self.session = self.server.new_session(
+            session_name=session_name,
+            window_name='bash',
+            window_command=window_command,
+            start_directory=self.work_dir,
+            kill_session=True,
+            x=1000,
+            y=1000,
         )
-        self._init_bash_shell(work_dir)
+
+        # Set history limit to a large number to avoid losing history
+        # https://unix.stackexchange.com/questions/43414/unlimited-history-in-tmux
+        self.session.set_option('history-limit', str(self.HISTORY_LIMIT), _global=True)
+        # Also kept as an attribute on the session object; read back in the debug log below
+        self.session.history_limit = self.HISTORY_LIMIT
+        # We need to create a new pane because the initial pane's history limit is (default) 2000
+        _initial_window = self.session.attached_window
+        self.window = self.session.new_window(
+            window_shell=window_command,
+            start_directory=self.work_dir,
+        )
+        self.pane = self.window.attached_pane
+        logger.debug(f'pane: {self.pane}; history_limit: {self.session.history_limit}')
+        # The replacement window exists now, so the initial one can be dropped
+        _initial_window.kill_window()
+
+        # Configure bash to use simple PS1 and disable PS2
+        # (PS1 is re-exported through PROMPT_COMMAND; PS2 is set to the empty string)
+        self.pane.send_keys(
+            f'export PROMPT_COMMAND=\'export PS1="{self.PS1}"\'; export PS2=""'
+        )
+        time.sleep(0.1)  # Wait for command to take effect
+        self._clear_screen()
+
+        # Store the last command for interactive input handling
+        self.prev_status: BashCommandStatus | None = None
+        self.prev_output: str = ''
+        self._closed: bool = False
+        logger.debug(f'Bash session initialized with work dir: {self.work_dir}')
+
+        # Maintain the current working directory
+        self._cwd = os.path.abspath(self.work_dir)
+        self._initialized = True
+
+    def __del__(self):
+        """Ensure the session is closed when the object is destroyed.
+
+        Cleanup is skipped when initialize() never completed: no tmux session
+        has been set up in that case, and calling close() on such an object
+        would fail on attributes that only initialize() assigns. getattr() is
+        used because __del__ may run even if __init__ itself did not finish.
+        """
+        if getattr(self, '_initialized', False):
+            self.close()
+
+    def _get_pane_content(self) -> str:
+        """Return the text tmux reports for the pane via `capture-pane -J -pS -`.
+
+        The captured lines are right-stripped and joined with newlines.
+        """
+        captured_lines = self.pane.cmd('capture-pane', '-J', '-pS', '-').stdout
+        # Right-strip every line to avoid double newlines in the joined text
+        return '\n'.join(line.rstrip() for line in captured_lines)

    def close(self):
-        self.shell.close()
+        """Clean up the session."""
+        if self._closed:
+            return
+        self.session.kill_session()
+        self._closed = True

     @property
-    def pwd(self):
-        return self._pwd
+    def cwd(self):
+        """Working directory tracked for the shell.
+
+        Set to the absolute `work_dir` in initialize() and updated from the PS1
+        metadata each time a command completes.
+        """
+        return self._cwd

-    @property
-    def workdir(self):
-        return self._get_working_directory()
+    def _is_special_key(self, command: str) -> bool:
+        """Tell whether the command, ignoring surrounding whitespace, is a special key."""
+        key = command.strip()
+        # Special keys are of the form C-<key>: exactly three characters
+        return len(key) == 3 and key.startswith('C-')

-    def _get_working_directory(self):
-        # NOTE: this is part of initialization, so we hard code the timeout
-        result, exit_code = self._execute_bash('pwd', timeout=60, keep_prompt=False)
-        if exit_code != 0:
-            raise RuntimeError(
-                f'Failed to get working directory (exit code: {exit_code}): {result}'
-            )
-        return result.strip()
+    def _clear_screen(self):
+        """Clear the tmux pane screen and history."""
+        # Send the C-l key (no trailing Enter) to the pane
+        self.pane.send_keys('C-l', enter=False)
+        # Short pause before clearing history -- presumably so the key press is handled first
+        time.sleep(0.1)
+        # Ask tmux to discard the pane's history
+        self.pane.cmd('clear-history')

-    def _init_bash_shell(self, work_dir: str):
-        self.__bash_PS1 = (
-            r'[PEXPECT_BEGIN]\n'
-            r'$(which python >/dev/null 2>&1 && echo "[Python Interpreter: $(which python)]\n")'
-            r'\u@\h:\w\n'
-            r'[PEXPECT_END]'
-        )
-
-        # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
-        self.__bash_expect_regex = r'\[PEXPECT_BEGIN\]\s*(.*?)\s*([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+)\s*\[PEXPECT_END\]'
-        # Set umask to allow group write permissions
-        self.shell.sendline(f'umask 002; export PS1="{self.__bash_PS1}"; export PS2=""')
-        self.shell.expect(self.__bash_expect_regex)
-
-        self.shell.sendline(
-            f'if [ ! -d "{work_dir}" ]; then mkdir -p "{work_dir}"; fi && cd "{work_dir}"'
-        )
-        self.shell.expect(self.__bash_expect_regex)
-        logger.debug(
-            f'Bash initialized. Working directory: {work_dir}. Output: [{self.shell.before}]'
-        )
-        # Ensure the group has write permissions on the working directory
-        self.shell.sendline(f'chmod g+rw "{work_dir}"')
-        self.shell.expect(self.__bash_expect_regex)
-
-    def _get_bash_prompt_and_update_pwd(self):
-        ps1 = self.shell.after
-        if ps1 == pexpect.EOF:
-            logger.error(f'Bash shell EOF! {self.shell.after=}, {self.shell.before=}')
-            raise RuntimeError('Bash shell EOF')
-        if ps1 == pexpect.TIMEOUT:
-            logger.warning('Bash shell timeout')
-            return ''
-
-        # begin at the last occurrence of '[PEXPECT_BEGIN]'.
-        # In multi-line bash commands, the prompt will be repeated
-        # and the matched regex captures all of them
-        # - we only want the last one (newest prompt)
-        _begin_pos = ps1.rfind('[PEXPECT_BEGIN]')
-        if _begin_pos != -1:
-            ps1 = ps1[_begin_pos:]
-
-        # parse the ps1 to get username, hostname, and working directory
-        matched = re.match(self.__bash_expect_regex, ps1)
-        assert (
-            matched is not None
-        ), f'Failed to parse bash prompt: {ps1}. This should not happen.'
-        other_info, username, hostname, working_dir = matched.groups()
-        working_dir = working_dir.rstrip()
-        self._pwd = os.path.expanduser(working_dir)
-
-        # re-assemble the prompt
-        # ignore the hostname AND use 'openhands-workspace'
-        prompt = f'{other_info.strip()}\n{username}@openhands-workspace:{working_dir} '
-        if username == 'root':
-            prompt += '#'
-        else:
-            prompt += '$'
-        return prompt + ' '
-
-    def _execute_bash(
+    def _get_command_output(
         self,
         command: str,
-        timeout: int,
-        keep_prompt: bool = True,
-        kill_on_timeout: bool = True,
-    ) -> tuple[str, int]:
-        logger.debug(f'Executing command: {command}')
-        self.shell.sendline(command)
-        return self._continue_bash(
-            timeout=timeout, keep_prompt=keep_prompt, kill_on_timeout=kill_on_timeout
+        raw_command_output: str,
+        metadata: CmdOutputMetadata,
+        continue_prefix: str = '',
+    ) -> str:
+        """Get the command output with the previous command output removed.
+
+        Side effects: `self.prev_output` is always replaced by `raw_command_output`,
+        and `metadata.prefix` is overwritten with `continue_prefix` whenever a
+        previous output was recorded.
+
+        Args:
+            command: The command that was executed.
+            raw_command_output: The raw output from the command.
+            metadata: The metadata object to store prefix/suffix in.
+            continue_prefix: The prefix to add to the command output if it's a continuation of the previous command.
+
+        Returns:
+            The output with the previously reported output and the echoed
+            command stripped from its start, and trailing whitespace removed.
+        """
+        # remove the previous command output from the new output if any
+        if self.prev_output:
+            command_output = raw_command_output.removeprefix(self.prev_output)
+            # NOTE(review): this also replaces a prefix the caller set earlier
+            # (e.g. the truncation notice) -- confirm that is intended.
+            metadata.prefix = continue_prefix
+        else:
+            command_output = raw_command_output
+        self.prev_output = raw_command_output  # update current command output anyway
+        # Drop the echoed command line from the start of the output
+        command_output = _remove_command_prefix(command_output, command)
+        return command_output.rstrip()
+
+    def _handle_completed_command(
+        self, command: str, pane_content: str, ps1_matches: list[re.Match]
+    ) -> CmdOutputObservation:
+        """Build the observation for a command whose closing PS1 prompt was seen.
+
+        Metadata is parsed from the last PS1 match, the tracked working directory
+        is updated, the status/previous-output state is reset and the pane is
+        cleared for the next command.
+
+        Args:
+            command: The command (or special key) that was sent.
+            pane_content: Full captured pane content.
+            ps1_matches: PS1 metadata matches found in `pane_content`.
+
+        Returns:
+            Observation carrying the command output and the parsed metadata.
+
+        Raises:
+            AssertionError: If `ps1_matches` is empty.
+        """
+        is_special_key = self._is_special_key(command)
+        assert len(ps1_matches) >= 1, (
+            f'Expected at least one PS1 metadata block, but got {len(ps1_matches)}.\n'
+            f'---FULL OUTPUT---\n{pane_content!r}\n---END OF OUTPUT---'
+        )
+        # The last prompt is the one printed after the command finished
+        metadata = CmdOutputMetadata.from_ps1_match(ps1_matches[-1])
+
+        # Special case where the previous command output is truncated due to history limit
+        # We should get the content BEFORE the last PS1 prompt
+        get_content_before_last_match = bool(len(ps1_matches) == 1)
+
+        # Update the current working directory if it has changed
+        if metadata.working_dir != self._cwd and metadata.working_dir:
+            self._cwd = metadata.working_dir
+
+        logger.debug(f'COMMAND OUTPUT: {pane_content}')
+        # Extract the command output between the two PS1 prompts
+        raw_command_output = self._combine_outputs_between_matches(
+            pane_content,
+            ps1_matches,
+            get_content_before_last_match=get_content_before_last_match,
         )
 
-    def _interrupt_bash(
+        if get_content_before_last_match:
+            # Count the number of lines in the truncated output
+            num_lines = len(raw_command_output.splitlines())
+            metadata.prefix = f'[Previous command outputs are truncated. Showing the last {num_lines} lines of the output below.]\n'
+
+        # The suffix reports the exit code; for special keys it also names the key sent
+        metadata.suffix = (
+            f'\n[The command completed with exit code {metadata.exit_code}.]'
+            if not is_special_key
+            else f'\n[The command completed with exit code {metadata.exit_code}. CTRL+{command[-1].upper()} was sent.]'
+        )
+        command_output = self._get_command_output(
+            command,
+            raw_command_output,
+            metadata,
+        )
+        self.prev_status = BashCommandStatus.COMPLETED
+        self.prev_output = ''  # Reset previous command output
+        self._ready_for_next_command()
+        return CmdOutputObservation(
+            content=command_output,
+            command=command,
+            metadata=metadata,
+        )
+
+    def _handle_nochange_timeout_command(
         self,
-        action_timeout: int | None,
-        interrupt_timeout: int | None = None,
-        max_retries: int = 2,
-    ) -> tuple[str, int]:
-        interrupt_timeout = interrupt_timeout or 1  # default timeout for SIGINT
-        # try to interrupt the bash shell use SIGINT
-        while max_retries > 0:
-            self.shell.sendintr()  # send SIGINT to the shell
-            logger.debug('Sent SIGINT to bash. Waiting for output...')
-            try:
-                self.shell.expect(self.__bash_expect_regex, timeout=interrupt_timeout)
-                output = self.shell.before
-                logger.debug(f'Received output after SIGINT: {output}')
-                exit_code = 130  # SIGINT
-
-                _additional_msg = ''
-                if action_timeout is not None:
-                    _additional_msg = (
-                        f'Command timed out after {action_timeout} seconds. '
-                    )
-                output += (
-                    '\r\n\r\n'
-                    + f'[{_additional_msg}SIGINT was sent to interrupt the command.]'
-                )
-                return output, exit_code
-            except pexpect.TIMEOUT as e:
-                logger.warning(f'Bash pexpect.TIMEOUT while waiting for SIGINT: {e}')
-                max_retries -= 1
-
-        # fall back to send control-z
-        logger.error(
-            'Failed to get output after SIGINT. Max retries reached. Sending control-z...'
+        command: str,
+        pane_content: str,
+        ps1_matches: list[re.Match],
+    ) -> CmdOutputObservation:
+        """Build the observation for a command whose output stopped changing.
+
+        Records NO_CHANGE_TIMEOUT as the previous status. The screen is not
+        cleared and `prev_output` is kept (updated by _get_command_output), so a
+        later call can report only the output that is new since this one.
+
+        Args:
+            command: The command that was sent.
+            pane_content: Full captured pane content.
+            ps1_matches: PS1 metadata matches found in `pane_content`.
+
+        Returns:
+            Observation with the output captured so far and a suffix explaining
+            the timeout; the metadata is a default CmdOutputMetadata.
+        """
+        self.prev_status = BashCommandStatus.NO_CHANGE_TIMEOUT
+        # More or fewer prompts than expected is only logged, not treated as an error
+        if len(ps1_matches) != 1:
+            logger.warning(
+                'Expected exactly one PS1 metadata block BEFORE the execution of a command, '
+                f'but got {len(ps1_matches)} PS1 metadata blocks:\n---\n{pane_content!r}\n---'
+            )
+        raw_command_output = self._combine_outputs_between_matches(
+            pane_content, ps1_matches
         )
-        self.shell.sendcontrol('z')
-        self.shell.expect(self.__bash_expect_regex)
-        output = self.shell.before
-        logger.debug(f'Received output after control-z: {output}')
-        # Try to kill the job
-        self.shell.sendline('kill -9 %1')
-        self.shell.expect(self.__bash_expect_regex)
-        logger.debug(f'Received output after killing job %1: {self.shell.before}')
-        output += self.shell.before
-
-        _additional_msg = ''
-        if action_timeout is not None:
-            _additional_msg = f'Command timed out after {action_timeout} seconds. '
-        output += (
-            '\r\n\r\n'
-            + f'[{_additional_msg}SIGINT was sent to interrupt the command, but failed. The command was killed.]'
+        metadata = CmdOutputMetadata()  # No metadata available
+        metadata.suffix = (
+            f'\n[The command has no new output after {self.NO_CHANGE_TIMEOUT_SECONDS} seconds. '
+            "You may wait longer to see additional output by sending empty command '', "
+            'send other commands to interact with the current process, '
+            'or send keys to interrupt/kill the command.]'
+        )
+        # continue_prefix is only applied when output from an earlier call was recorded
+        command_output = self._get_command_output(
+            command,
+            raw_command_output,
+            metadata,
+            continue_prefix='[Command output continued from previous command]\n',
+        )
+        return CmdOutputObservation(
+            content=command_output,
+            command=command,
+            metadata=metadata,
         )

-        # Try to get the exit code again
-        self.shell.sendline('echo $?')
-        self.shell.expect(self.__bash_expect_regex)
-        _exit_code_output = self.shell.before
-        exit_code = self._parse_exit_code(_exit_code_output)
-
-        return output, exit_code
-
-    def _parse_exit_code(self, output: str) -> int:
-        try:
-            exit_code = int(output.strip().split()[0])
-        except Exception:
-            logger.error('Error getting exit code from bash script')
-            # If we try to run an invalid shell script the output sometimes includes error text
-            # rather than the error code - we assume this is an error
-            exit_code = 2
-        return exit_code
-
-    def _continue_bash(
+    def _handle_hard_timeout_command(
         self,
-        timeout: int,
-        keep_prompt: bool = True,
-        kill_on_timeout: bool = True,
-    ) -> tuple[str, int]:
-        logger.debug(f'Continuing bash with timeout={timeout}')
-        try:
-            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
+        command: str,
+        pane_content: str,
+        ps1_matches: list[re.Match],
+        timeout: float,
+    ) -> CmdOutputObservation:
+        """Build the observation for a command that exceeded the action timeout.
+
+        Records HARD_TIMEOUT as the previous status. The screen is not cleared
+        and `prev_output` is kept (updated by _get_command_output), so a later
+        call can report only the output that is new since this one.
+
+        Args:
+            command: The command that was sent.
+            pane_content: Full captured pane content.
+            ps1_matches: PS1 metadata matches found in `pane_content`.
+            timeout: The timeout, in seconds, reported in the suffix.
+
+        Returns:
+            Observation with the output captured so far and a suffix explaining
+            the timeout; the metadata is a default CmdOutputMetadata.
+        """
+        self.prev_status = BashCommandStatus.HARD_TIMEOUT
+        # More or fewer prompts than expected is only logged, not treated as an error
+        if len(ps1_matches) != 1:
+            logger.warning(
+                'Expected exactly one PS1 metadata block BEFORE the execution of a command, '
+                f'but got {len(ps1_matches)} PS1 metadata blocks:\n---\n{pane_content!r}\n---'
+            )
+        raw_command_output = self._combine_outputs_between_matches(
+            pane_content, ps1_matches
+        )
+        metadata = CmdOutputMetadata()  # No metadata available
+        metadata.suffix = (
+            f'\n[The command timed out after {timeout} seconds. '
+            "You may wait longer to see additional output by sending empty command '', "
+            'send other commands to interact with the current process, '
+            'or send keys to interrupt/kill the command.]'
+        )
+        # continue_prefix is only applied when output from an earlier call was recorded
+        command_output = self._get_command_output(
+            command,
+            raw_command_output,
+            metadata,
+            continue_prefix='[Command output continued from previous command]\n',
+        )
 
-            output = self.shell.before
+        return CmdOutputObservation(
+            command=command,
+            content=command_output,
+            metadata=metadata,
+        )

-            # Get exit code
-            self.shell.sendline('echo $?')
-            logger.debug('Requesting exit code...')
-            self.shell.expect(self.__bash_expect_regex, timeout=timeout)
-            _exit_code_output = self.shell.before
-            exit_code = self._parse_exit_code(_exit_code_output)
-        except pexpect.TIMEOUT as e:
-            logger.warning(f'Bash pexpect.TIMEOUT while executing bash command: {e}')
-            if kill_on_timeout:
-                output, exit_code = self._interrupt_bash(action_timeout=timeout)
+    def _ready_for_next_command(self):
+        """Prepare the pane for a new command by clearing its screen and history."""
+        # Delegates to _clear_screen(); nothing else is reset here
+        self._clear_screen()
+
+    def _combine_outputs_between_matches(
+        self,
+        pane_content: str,
+        ps1_matches: list[re.Match],
+        get_content_before_last_match: bool = False,
+    ) -> str:
+        """Combine all outputs between PS1 matches.
+
+        With exactly one match, the text before or after that prompt is returned,
+        depending on `get_content_before_last_match`. With several matches, the
+        text between each consecutive pair is concatenated, each segment followed
+        by a newline; text after the final prompt is not included. With no
+        matches the result is the empty string.
+
+        Args:
+            pane_content: The full pane content containing PS1 prompts and command outputs
+            ps1_matches: List of regex matches for PS1 prompts
+            get_content_before_last_match: when there's only one PS1 match, whether to get
+                the content before the last PS1 prompt (True) or after the last PS1 prompt (False)
+        Returns:
+            Combined string of all outputs between matches
+        """
+        if len(ps1_matches) == 1:
+            if get_content_before_last_match:
+                # The command output is the content before the last PS1 prompt
+                return pane_content[: ps1_matches[0].start()]
             else:
-                output = self.shell.before or ''
-                exit_code = -1
-        finally:
-            bash_prompt = self._get_bash_prompt_and_update_pwd()
-            if keep_prompt:
-                output += '\r\n' + bash_prompt
-        return output, exit_code
+                # The command output is the content after the last PS1 prompt
+                # (`+ 1` skips one character after the match -- presumably the newline; TODO confirm)
+                return pane_content[ps1_matches[0].end() + 1 :]
+        combined_output = ''
+        for i in range(len(ps1_matches) - 1):
+            # Extract content between current and next PS1 prompt
+            output_segment = pane_content[
+                ps1_matches[i].end() + 1 : ps1_matches[i + 1].start()
+            ]
+            combined_output += output_segment + '\n'
+        logger.debug(f'COMBINED OUTPUT: {combined_output}')
+        return combined_output

-    def run(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation:
-        try:
-            assert (
-                action.timeout is not None
-            ), f'Timeout argument is required for CmdRunAction: {action}'
-            commands = split_bash_commands(action.command)
-            all_output = ''
-            python_interpreter = ''
-            for command in commands:
-                if command == '':
-                    output, exit_code = self._continue_bash(
-                        timeout=SOFT_TIMEOUT_SECONDS,
-                        keep_prompt=action.keep_prompt,
-                        kill_on_timeout=False,
-                    )
-                elif command.lower() == 'ctrl+c':
-                    output, exit_code = self._interrupt_bash(
-                        action_timeout=None,  # intentionally None
-                    )
-                else:
-                    output, exit_code = self._execute_bash(
-                        command,
-                        timeout=SOFT_TIMEOUT_SECONDS
-                        if not action.blocking
-                        else action.timeout,
-                        keep_prompt=action.keep_prompt,
-                        kill_on_timeout=False if not action.blocking else True,
-                    )
-                    # Get rid of the python interpreter string from each line of the output.
-                    # We need it only once at the end.
-                    parts = output.rsplit('[Python Interpreter: ', 1)
-                    output = parts[0]
-                    if len(parts) == 2:
-                        python_interpreter = '[Python Interpreter: ' + parts[1]
-                if all_output:
-                    # previous output already exists so we add a newline
-                    all_output += '\r\n'
+    def execute(self, action: CmdRunAction) -> CmdOutputObservation | ErrorObservation:
+        """Execute a command in the bash session.
+
+        The stripped command is sent to the tmux pane (special keys without a
+        trailing Enter), then the pane is polled every POLL_INTERVAL seconds
+        until one of three things happens: the pane ends with the PS1 end
+        marker (completed), the output has not changed for
+        NO_CHANGE_TIMEOUT_SECONDS (non-blocking actions only), or
+        `action.timeout` seconds have elapsed. An empty command sends nothing
+        and just resumes polling; it is only accepted when the previous status
+        is CONTINUE, NO_CHANGE_TIMEOUT or HARD_TIMEOUT.
+
+        Args:
+            action: The action carrying `command`, `blocking` and `timeout`.
+
+        Returns:
+            A CmdOutputObservation for completed or timed-out commands and for
+            an empty command with nothing to continue; an ErrorObservation when
+            the input splits into more than one command.
+
+        Raises:
+            RuntimeError: If initialize() has not completed, or if polling stops
+                because should_continue() returned a falsy value.
+            AssertionError: If no PS1 metadata block is found before sending.
+        """
+        if not self._initialized:
+            raise RuntimeError('Bash session is not initialized')
 
-                # If the command originated with the agent, append the command that was run...
-                if action.source == EventSource.AGENT:
-                    all_output += command + '\r\n'
+        # Strip the command of any leading/trailing whitespace
+        logger.debug(f'RECEIVED ACTION: {action}')
+        command = action.command.strip()
 
-                all_output += str(output)
-                if exit_code != 0:
-                    break
+        # An empty command means "keep waiting"; that needs a previous command that has not completed
+        if command == '' and self.prev_status not in {
+            BashCommandStatus.CONTINUE,
+            BashCommandStatus.NO_CHANGE_TIMEOUT,
+            BashCommandStatus.HARD_TIMEOUT,
+        }:
             return CmdOutputObservation(
-                command_id=-1,
-                content=all_output.rstrip('\r\n'),
-                command=action.command,
-                hidden=action.hidden,
-                exit_code=exit_code,
-                interpreter_details=python_interpreter,
+                content='ERROR: No previous command to continue from. '
+                + 'Previous command has to be timeout to be continued.',
+                command='',
+                metadata=CmdOutputMetadata(),
             )
-        except UnicodeDecodeError as e:
+
+        # Only a single command per call is accepted
+        splited_commands = split_bash_commands(command)
+        if len(splited_commands) > 1:
             return ErrorObservation(
-                f'Runtime bash execution failed: Command output could not be decoded as utf-8. {str(e)}',
+                content=(
+                    f'ERROR: Cannot execute multiple commands at once.\n'
+                    f'Please run each command separately OR chain them into a single command via && or ;\n'
+                    f'Provided commands:\n{"\n".join(f"({i+1}) {cmd}" for i, cmd in enumerate(splited_commands))}'
+                )
             )
+
+        # start_time drives the hard timeout; last_change_time drives the no-change timeout
+        start_time = time.time()
+        last_change_time = start_time
+        last_pane_output = self._get_pane_content()
+
+        # Sanity-check the pane before sending anything
+        # NOTE(review): _ps1_matches is not read again after this check.
+        _ps1_matches = CmdOutputMetadata.matches_ps1_metadata(last_pane_output)
+        assert len(_ps1_matches) >= 1, (
+            'Expected at least one PS1 metadata block BEFORE the execution of a command, '
+            f'but got {len(_ps1_matches)} PS1 metadata blocks:\n---\n{last_pane_output!r}\n---'
+        )
+        if len(_ps1_matches) > 1:
+            logger.warning(
+                'Found multiple PS1 metadata blocks BEFORE the execution of a command. '
+                'Only the last one will be used.'
+            )
+            _ps1_matches = [_ps1_matches[-1]]
+
+        if command != '':
+            # convert command to raw string
+            command = escape_bash_special_chars(command)
+            logger.debug(f'SENDING COMMAND: {command!r}')
+            # Special keys (C-<key>) are sent without a trailing Enter
+            self.pane.send_keys(
+                command,
+                enter=not self._is_special_key(command),
+            )
+
+        # Loop until the command completes or times out
+        while should_continue():
+            _start_time = time.time()
+            logger.debug(f'GETTING PANE CONTENT at {_start_time}')
+            cur_pane_output = self._get_pane_content()
+            logger.debug(
+                f'PANE CONTENT GOT after {time.time() - _start_time:.2f} seconds'
+            )
+            logger.debug(f'BEGIN OF PANE CONTENT: {cur_pane_output.split("\n")[:10]}')
+            logger.debug(f'END OF PANE CONTENT: {cur_pane_output.split("\n")[-10:]}')
+            ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_pane_output)
+            # Any change in the pane resets the no-change timer
+            if cur_pane_output != last_pane_output:
+                last_pane_output = cur_pane_output
+                last_change_time = time.time()
+                logger.debug(f'CONTENT UPDATED DETECTED at {last_change_time}')
+
+            # 1) Execution completed
+            # if the last command output contains the end marker
+            if cur_pane_output.rstrip().endswith(CMD_OUTPUT_PS1_END.rstrip()):
+                return self._handle_completed_command(
+                    command,
+                    pane_content=cur_pane_output,
+                    ps1_matches=ps1_matches,
+                )
+
+            # 2) Execution timed out since there's no change in output
+            # for a while (self.NO_CHANGE_TIMEOUT_SECONDS)
+            # We ignore this if the command is *blocking
+            time_since_last_change = time.time() - last_change_time
+            logger.debug(
+                f'CHECKING NO CHANGE TIMEOUT ({self.NO_CHANGE_TIMEOUT_SECONDS}s): elapsed {time_since_last_change}'
+            )
+            if (
+                not action.blocking
+                and time_since_last_change >= self.NO_CHANGE_TIMEOUT_SECONDS
+            ):
+                return self._handle_nochange_timeout_command(
+                    command,
+                    pane_content=cur_pane_output,
+                    ps1_matches=ps1_matches,
+                )
+
+            # 3) Execution timed out due to hard timeout
+            # (skipped when action.timeout is falsy, i.e. None or 0)
+            logger.debug(
+                f'CHECKING HARD TIMEOUT ({action.timeout}s): elapsed {time.time() - start_time}'
+            )
+            if action.timeout and time.time() - start_time >= action.timeout:
+                return self._handle_hard_timeout_command(
+                    command,
+                    pane_content=cur_pane_output,
+                    ps1_matches=ps1_matches,
+                    timeout=action.timeout,
+                )
+
+            logger.debug(f'SLEEPING for {self.POLL_INTERVAL} seconds for next poll')
+            time.sleep(self.POLL_INTERVAL)
+        # Reached only when should_continue() stopped the loop
+        raise RuntimeError('Bash session was likely interrupted...')
--- a/openhands/runtime/utils/runtime_init.py
+++ b/openhands/runtime/utils/runtime_init.py
@ -4,7 +4,7 @@ from openhands.core.logger import openhands_logger as logger


 def init_user_and_working_directory(
-    username: str, user_id: int, initial_pwd: str
+    username: str, user_id: int, initial_cwd: str
 ) -> int | None:
    """Create working directory and user if not exists.
    It performs the following steps effectively:
@ -26,23 +26,23 @@ def init_user_and_working_directory(
    Args:
        username (str): The username to create.
        user_id (int): The user ID to assign to the user.
-        initial_pwd (str): The initial working directory to create.
+        initial_cwd (str): The initial working directory to create.

    Returns:
        int | None: The user ID if it was updated, None otherwise.
    """

    # First create the working directory, independent of the user
-    logger.debug(f'Client working directory: {initial_pwd}')
-    command = f'umask 002; mkdir -p {initial_pwd}'
+    logger.debug(f'Client working directory: {initial_cwd}')
+    command = f'umask 002; mkdir -p {initial_cwd}'
    output = subprocess.run(command, shell=True, capture_output=True)
    out_str = output.stdout.decode()

-    command = f'chown -R {username}:root {initial_pwd}'
+    command = f'chown -R {username}:root {initial_cwd}'
    output = subprocess.run(command, shell=True, capture_output=True)
    out_str += output.stdout.decode()

-    command = f'chmod g+rw {initial_pwd}'
+    command = f'chmod g+rw {initial_cwd}'
    output = subprocess.run(command, shell=True, capture_output=True)
    out_str += output.stdout.decode()
    logger.debug(f'Created working directory. Output: [{out_str}]')
--- a/openhands/runtime/utils/runtime_templates/Dockerfile.j2
+++ b/openhands/runtime/utils/runtime_templates/Dockerfile.j2
@ -15,7 +15,7 @@ ENV POETRY_VIRTUALENVS_PATH=/openhands/poetry \
 # Install base system dependencies
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        wget curl sudo apt-utils git jq \
+        wget curl sudo apt-utils git jq tmux \
        {% if 'ubuntu' in base_image and (base_image.endswith(':latest') or base_image.endswith(':24.04')) %}
        libgl1 \
        {% else %}
--- a/poetry.lock
+++ b/poetry.lock
@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@ -3711,6 +3711,17 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0"
 [package.extras]
 adal = ["adal (>=1.0.2)"]

+[[package]]
+name = "libtmux"
+version = "0.37.0"
+description = "Typed library that provides an ORM wrapper for tmux, a terminal multiplexer."
+optional = false
+python-versions = "<4.0,>=3.8"
+files = [
+    {file = "libtmux-0.37.0-py3-none-any.whl", hash = "sha256:7e8cbab30b033d132b6fca5dddb575bb7f6a1fd802328e7174f9b49023556376"},
+    {file = "libtmux-0.37.0.tar.gz", hash = "sha256:21955c5dce6332db41abad5e26ae8c4062ef2b9a89099bd57a36f52be1d5270f"},
+]
+
 [[package]]
 name = "libwebarena"
 version = "0.0.3"
@ -8298,6 +8309,25 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"]
 pymysql = ["pymysql"]
 sqlcipher = ["sqlcipher3_binary"]

+[[package]]
+name = "sse-starlette"
+version = "2.1.3"
+description = "SSE plugin for Starlette"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "sse_starlette-2.1.3-py3-none-any.whl", hash = "sha256:8ec846438b4665b9e8c560fcdea6bc8081a3abf7942faa95e5a744999d219772"},
+    {file = "sse_starlette-2.1.3.tar.gz", hash = "sha256:9cd27eb35319e1414e3d2558ee7414487f9529ce3b3cf9b21434fd110e017169"},
+]
+
+[package.dependencies]
+anyio = "*"
+starlette = "*"
+uvicorn = "*"
+
+[package.extras]
+examples = ["fastapi"]
+
 [[package]]
 name = "stack-data"
 version = "0.6.3"
@ -10054,4 +10084,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "691bdd0f64e3476858eb34ce6ed6d0b0e7d97458cfd69fd366cd9c1c4f4ec897"
+content-hash = "db887f071f7dbb712cfba5d9b4de8938afbedee22fd166b4527f4aec40e37cfd"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openhands-ai"
-version = "0.18.0"
+version = "0.18.0a0"
 description = "OpenHands: Code Less, Make More"
 authors = ["OpenHands"]
 license = "MIT"
@ -62,11 +62,13 @@ opentelemetry-api = "1.25.0"
 opentelemetry-exporter-otlp-proto-grpc = "1.25.0"
 modal = ">=0.66.26,<0.71.0"
 runloop-api-client = "0.11.0"
+libtmux = "^0.37.0"
 pygithub = "^2.5.0"
 joblib = "*"
 openhands-aci = "0.1.5"
 python-socketio = "^5.11.4"
 redis = "^5.2.0"
+sse-starlette = "^2.1.3"

 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
@ -100,6 +102,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]

+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@ -129,6 +132,7 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"

+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
--- a/tests/runtime/conftest.py
+++ b/tests/runtime/conftest.py
@ -69,7 +69,7 @@ def _close_test_runtime(runtime: Runtime) -> None:
    time.sleep(1)


-def _reset_pwd() -> None:
+def _reset_cwd() -> None:
    global project_dir
    # Try to change back to project directory
    try:
@ -152,16 +152,16 @@ def get_run_as_openhands() -> list[bool]:

@pytest.fixture(scope='module')  # for xdist
 def runtime_setup_module():
-    _reset_pwd()
+    _reset_cwd()
    yield
-    _reset_pwd()
+    _reset_cwd()


@pytest.fixture(scope='session')  # not for xdist
 def runtime_setup_session():
-    _reset_pwd()
+    _reset_cwd()
    yield
-    _reset_pwd()
+    _reset_cwd()


 # This assures that all tests run together per runtime, not alternating between them,
@ -230,14 +230,14 @@ def _load_runtime(
    global test_mount_path
    if use_workspace:
        test_mount_path = os.path.join(config.workspace_base, 'rt')
+    elif temp_dir is not None:
+        test_mount_path = os.path.join(temp_dir, sid)
    else:
-        test_mount_path = os.path.join(
-            temp_dir, sid
-        )  # need a subfolder to avoid conflicts
+        test_mount_path = None
    config.workspace_mount_path = test_mount_path

    # Mounting folder specific for this test inside the sandbox
-    config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}/{sid}'
+    config.workspace_mount_path_in_sandbox = f'{sandbox_test_folder}'
    print('\nPaths used:')
    print(f'use_host_network: {config.sandbox.use_host_network}')
    print(f'workspace_base: {config.workspace_base}')
--- a/tests/runtime/test_bash.py
+++ b/tests/runtime/test_bash.py
@ -5,7 +5,6 @@ from pathlib import Path

 import pytest
 from conftest import (
-    TEST_IN_CI,
    _close_test_runtime,
    _get_sandbox_folder,
    _load_runtime,
@ -13,7 +12,7 @@ from conftest import (

 from openhands.core.logger import openhands_logger as logger
 from openhands.events.action import CmdRunAction
-from openhands.events.observation import CmdOutputObservation
+from openhands.events.observation import CmdOutputObservation, ErrorObservation
 from openhands.runtime.base import Runtime

 # ============================================================================================================================
@ -21,36 +20,19 @@ from openhands.runtime.base import Runtime
 # ============================================================================================================================


-def _run_cmd_action(runtime, custom_command: str, keep_prompt=True):
-    action = CmdRunAction(command=custom_command, keep_prompt=keep_prompt)
+def _run_cmd_action(runtime, custom_command: str):
+    action = CmdRunAction(command=custom_command)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
-    assert isinstance(obs, CmdOutputObservation)
+    assert isinstance(obs, (CmdOutputObservation, ErrorObservation))
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    return obs


-def test_bash_command_pexcept(temp_dir, runtime_cls, run_as_openhands):
+def test_bash_command_env(temp_dir, runtime_cls, run_as_openhands):
    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
-        # We set env var PS1="\u@\h:\w $"
-        # and construct the PEXCEPT prompt base on it.
-        # When run `env`, bad implementation of CmdRunAction will be pexcepted by this
-        # and failed to pexcept the right content, causing it fail to get error code.
        obs = runtime.run_action(CmdRunAction(command='env'))
-
-        # For example:
-        # 02:16:13 - openhands:DEBUG: client.py:78 - Executing command: env
-        # 02:16:13 - openhands:DEBUG: client.py:82 - Command output: PYTHONUNBUFFERED=1
-        # CONDA_EXE=/openhands/miniforge3/bin/conda
-        # [...]
-        # LC_CTYPE=C.UTF-8
-        # PS1=\u@\h:\w $
-        # 02:16:13 - openhands:DEBUG: client.py:89 - Executing command for exit code: env
-        # 02:16:13 - openhands:DEBUG: client.py:92 - Exit code Output:
-        # CONDA_DEFAULT_ENV=base
-
-        # As long as the exit code is 0, the test will pass.
        assert isinstance(
            obs, CmdOutputObservation
        ), 'The observation should be a CmdOutputObservation.'
@ -59,52 +41,7 @@ def test_bash_command_pexcept(temp_dir, runtime_cls, run_as_openhands):
        _close_test_runtime(runtime)


-def test_bash_timeout_and_keyboard_interrupt(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    try:
-        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert isinstance(obs, CmdOutputObservation)
-        assert (
-            '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
-            in obs.content
-        )
-        assert 'KeyboardInterrupt' in obs.content
-
-        # follow up command should not be affected
-        action = CmdRunAction(command='ls')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-
-        # run it again!
-        action = CmdRunAction(command='python -c "import time; time.sleep(10)"')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert (
-            '[Command timed out after 1 seconds. SIGINT was sent to interrupt the command.]'
-            in obs.content
-        )
-        assert 'KeyboardInterrupt' in obs.content
-
-        # things should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-
-    finally:
-        _close_test_runtime(runtime)
-
-
-def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
+def test_bash_server(temp_dir, runtime_cls, run_as_openhands):
    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
        action = CmdRunAction(command='python3 -m http.server 8080')
@ -112,9 +49,21 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 130  # script was killed by SIGINT
+        assert obs.exit_code == -1
        assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
+        assert (
+            "[The command timed out after 1 seconds. You may wait longer to see additional output by sending empty command '', send other commands to interact with the current process, or send keys to interrupt/kill the command.]"
+            in obs.metadata.suffix
+        )
+
+        action = CmdRunAction(command='C-c')
+        action.timeout = 30
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert isinstance(obs, CmdOutputObservation)
+        assert obs.exit_code == 0
        assert 'Keyboard interrupt received, exiting.' in obs.content
+        assert '/workspace' in obs.metadata.working_dir

        action = CmdRunAction(command='ls')
        action.timeout = 1
@ -122,7 +71,8 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert isinstance(obs, CmdOutputObservation)
        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
+        assert 'Keyboard interrupt received, exiting.' not in obs.content
+        assert '/workspace' in obs.metadata.working_dir

        # run it again!
        action = CmdRunAction(command='python3 -m http.server 8080')
@ -130,122 +80,8 @@ def test_bash_pexcept_eof(temp_dir, runtime_cls, run_as_openhands):
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 130  # script was killed by SIGINT
+        assert obs.exit_code == -1
        assert 'Serving HTTP on 0.0.0.0 port 8080' in obs.content
-        assert 'Keyboard interrupt received, exiting.' in obs.content
-
-        # things should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 1
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-    finally:
-        _close_test_runtime(runtime)
-
-
-def test_process_resistant_to_one_sigint(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    try:
-        # Create a bash script that ignores SIGINT up to 1 times
-        script_content = """
-#!/bin/bash
-trap_count=0
-trap 'echo "Caught SIGINT ($((++trap_count))/1), ignoring..."; [ $trap_count -ge 1 ] && trap - INT && exit' INT
-while true; do
-    echo "Still running..."
-    sleep 1
-done
-        """.strip()
-
-        with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
-            f.write(script_content)
-        os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
-
-        runtime.copy_to(
-            os.path.join(temp_dir, 'resistant_script.sh'),
-            runtime.config.workspace_mount_path_in_sandbox,
-        )
-
-        # Run the resistant script
-        action = CmdRunAction(command='sudo bash ./resistant_script.sh')
-        action.timeout = 5
-        action.blocking = True
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 130  # script was killed by SIGINT
-        assert 'Still running...' in obs.content
-        assert 'Caught SIGINT (1/1), ignoring...' in obs.content
-        assert 'Stopped' not in obs.content
-        assert (
-            '[Command timed out after 5 seconds. SIGINT was sent to interrupt the command.]'
-            in obs.content
-        )
-
-        # Normal command should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 10
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-        assert 'resistant_script.sh' in obs.content
-
-    finally:
-        _close_test_runtime(runtime)
-
-
-def test_process_resistant_to_multiple_sigint(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    try:
-        # Create a bash script that ignores SIGINT up to 2 times
-        script_content = """
-#!/bin/bash
-trap_count=0
-trap 'echo "Caught SIGINT ($((++trap_count))/3), ignoring..."; [ $trap_count -ge 3 ] && trap - INT && exit' INT
-while true; do
-    echo "Still running..."
-    sleep 1
-done
-        """.strip()
-
-        with open(f'{temp_dir}/resistant_script.sh', 'w') as f:
-            f.write(script_content)
-        os.chmod(f'{temp_dir}/resistant_script.sh', 0o777)
-
-        runtime.copy_to(
-            os.path.join(temp_dir, 'resistant_script.sh'),
-            runtime.config.workspace_mount_path_in_sandbox,
-        )
-
-        # Run the resistant script
-        action = CmdRunAction(command='sudo bash ./resistant_script.sh')
-        action.timeout = 2
-        action.blocking = True
-        logger.info(action, extra={'msg_type': 'ACTION'})
-        obs = runtime.run_action(action)
-        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert 'Still running...' in obs.content
-        assert 'Caught SIGINT (1/3), ignoring...' in obs.content
-        assert '[1]+' and 'Stopped' in obs.content
-        assert (
-            '[Command timed out after 2 seconds. SIGINT was sent to interrupt the command, but failed. The command was killed.]'
-            in obs.content
-        )
-
-        # Normal command should still work
-        action = CmdRunAction(command='ls')
-        action.timeout = 10
-        obs = runtime.run_action(action)
-        assert isinstance(obs, CmdOutputObservation)
-        assert obs.exit_code == 0
-        assert '/workspace' in obs.interpreter_details
-        assert 'resistant_script.sh' in obs.content

    finally:
        _close_test_runtime(runtime)
@ -262,12 +98,12 @@ def test_multiline_commands(temp_dir, runtime_cls):
        # test multiline echo
        obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
        assert obs.exit_code == 0, 'The exit code should be 0.'
-        assert 'hello\r\nworld' in obs.content
+        assert 'hello\nworld' in obs.content

        # test whitespace
        obs = _run_cmd_action(runtime, 'echo -e "a\\n\\n\\nz"')
        assert obs.exit_code == 0, 'The exit code should be 0.'
-        assert '\r\n\r\n\r\n' in obs.content
+        assert '\n\n\n' in obs.content
    finally:
        _close_test_runtime(runtime)

@ -276,43 +112,43 @@ def test_multiple_multiline_commands(temp_dir, runtime_cls, run_as_openhands):
    cmds = [
        'ls -l',
        'echo -e "hello\nworld"',
-        """
-echo -e "hello it\\'s me"
-""".strip(),
-        """
-echo \\
+        """echo -e "hello it's me\"""",
+        """echo \\
    -e 'hello' \\
-    -v
-""".strip(),
-        """
-echo -e 'hello\\nworld\\nare\\nyou\\nthere?'
-""".strip(),
-        """
-echo -e 'hello
-world
-are
-you\\n
-there?'
-""".strip(),
-        """
-echo -e 'hello
-world "
-'
-""".strip(),
+    -v""",
+        """echo -e 'hello\\nworld\\nare\\nyou\\nthere?'""",
+        """echo -e 'hello\nworld\nare\nyou\n\nthere?'""",
+        """echo -e 'hello\nworld "'""",
    ]
    joined_cmds = '\n'.join(cmds)

    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
    try:
+        # First test that running multiple commands at once fails
        obs = _run_cmd_action(runtime, joined_cmds)
-        assert obs.exit_code == 0, 'The exit code should be 0.'
+        assert isinstance(obs, ErrorObservation)
+        assert 'Cannot execute multiple commands at once' in obs.content

-        assert 'total 0' in obs.content
-        assert 'hello\r\nworld' in obs.content
-        assert "hello it\\'s me" in obs.content
-        assert 'hello -v' in obs.content
-        assert 'hello\r\nworld\r\nare\r\nyou\r\nthere?' in obs.content
-        assert 'hello\r\nworld\r\nare\r\nyou\r\n\r\nthere?' in obs.content
+        # Now run each command individually and verify they work
+        results = []
+        for cmd in cmds:
+            obs = _run_cmd_action(runtime, cmd)
+            assert isinstance(obs, CmdOutputObservation)
+            assert obs.exit_code == 0
+            results.append(obs.content)
+
+        # Verify all expected outputs are present
+        assert 'total 0' in results[0]  # ls -l
+        assert 'hello\nworld' in results[1]  # echo -e "hello\nworld"
+        assert "hello it's me" in results[2]  # echo -e "hello it's me"
+        assert 'hello -v' in results[3]  # echo -e 'hello' -v
+        assert (
+            'hello\nworld\nare\nyou\nthere?' in results[4]
+        )  # echo -e 'hello\nworld\nare\nyou\nthere?'
+        assert (
+            'hello\nworld\nare\nyou\n\nthere?' in results[5]
+        )  # echo -e with literal newlines
+        assert 'hello\nworld "' in results[6]  # echo -e with quote
    finally:
        _close_test_runtime(runtime)

@ -324,7 +160,7 @@ def test_no_ps2_in_output(temp_dir, runtime_cls, run_as_openhands):
        obs = _run_cmd_action(runtime, 'echo -e "hello\nworld"')
        assert obs.exit_code == 0, 'The exit code should be 0.'

-        assert 'hello\r\nworld' in obs.content
+        assert 'hello\nworld' in obs.content
        assert '>' not in obs.content
    finally:
        _close_test_runtime(runtime)
@ -332,21 +168,17 @@ def test_no_ps2_in_output(temp_dir, runtime_cls, run_as_openhands):

 def test_multiline_command_loop(temp_dir, runtime_cls):
    # https://github.com/All-Hands-AI/OpenHands/issues/3143
-    init_cmd = """
-mkdir -p _modules && \
+    init_cmd = """mkdir -p _modules && \
 for month in {01..04}; do
    for day in {01..05}; do
        touch "_modules/2024-${month}-${day}-sample.md"
    done
-done
-echo "created files"
+done && echo "created files"
 """
-    follow_up_cmd = """
-for file in _modules/*.md; do
+    follow_up_cmd = """for file in _modules/*.md; do
    new_date=$(echo $file | sed -E 's/2024-(01|02|03|04)-/2024-/;s/2024-01/2024-08/;s/2024-02/2024-09/;s/2024-03/2024-10/;s/2024-04/2024-11/')
    mv "$file" "$new_date"
-done
-echo "success"
+done && echo "success"
 """
    runtime = _load_runtime(temp_dir, runtime_cls)
    try:
@ -424,7 +256,6 @@ def test_multi_cmd_run_in_single_line(temp_dir, runtime_cls):

 def test_stateful_cmd(temp_dir, runtime_cls):
    runtime = _load_runtime(temp_dir, runtime_cls)
-    sandbox_dir = _get_sandbox_folder(runtime)
    try:
        obs = _run_cmd_action(runtime, 'mkdir -p test')
        assert obs.exit_code == 0, 'The exit code should be 0.'
@ -434,7 +265,7 @@ def test_stateful_cmd(temp_dir, runtime_cls):

        obs = _run_cmd_action(runtime, 'pwd')
        assert obs.exit_code == 0, 'The exit code should be 0.'
-        assert f'{sandbox_dir}/test' in obs.content
+        assert '/workspace/test' in obs.content
    finally:
        _close_test_runtime(runtime)

@ -532,7 +363,7 @@ def test_copy_to_non_existent_directory(temp_dir, runtime_cls):
 def test_overwrite_existing_file(temp_dir, runtime_cls):
    runtime = _load_runtime(temp_dir, runtime_cls)
    try:
-        sandbox_dir = _get_sandbox_folder(runtime)
+        sandbox_dir = '/openhands/workspace'

        obs = _run_cmd_action(runtime, f'ls -alh {sandbox_dir}')
        assert obs.exit_code == 0
@ -595,38 +426,13 @@ def test_copy_from_directory(temp_dir, runtime_cls):
        _close_test_runtime(runtime)


-def test_keep_prompt(runtime_cls, temp_dir):
-    runtime = _load_runtime(
-        temp_dir,
-        runtime_cls=runtime_cls,
-        run_as_openhands=False,
-    )
-    try:
-        sandbox_dir = _get_sandbox_folder(runtime)
-
-        obs = _run_cmd_action(runtime, f'touch {sandbox_dir}/test_file.txt')
-        assert obs.exit_code == 0
-        assert 'root@' in obs.interpreter_details
-
-        obs = _run_cmd_action(
-            runtime, f'cat {sandbox_dir}/test_file.txt', keep_prompt=False
-        )
-        assert obs.exit_code == 0
-        assert 'root@' not in obs.interpreter_details
-    finally:
-        _close_test_runtime(runtime)
-
-
-@pytest.mark.skipif(
-    TEST_IN_CI != 'True',
-    reason='This test is not working in WSL (file ownership)',
-)
 def test_git_operation(runtime_cls):
    # do not mount workspace, since workspace mount by tests will be owned by root
    # while the user_id we get via os.getuid() is different from root
    # which causes permission issues
    runtime = _load_runtime(
        temp_dir=None,
+        use_workspace=False,
        runtime_cls=runtime_cls,
        # Need to use non-root user to expose issues
        run_as_openhands=True,
@ -634,12 +440,15 @@ def test_git_operation(runtime_cls):
    # this will happen if permission of runtime is not properly configured
    # fatal: detected dubious ownership in repository at '/workspace'
    try:
+        obs = _run_cmd_action(runtime, 'sudo chown -R openhands:root .')
+        assert obs.exit_code == 0
+
        # check the ownership of the current directory
        obs = _run_cmd_action(runtime, 'ls -alh .')
        assert obs.exit_code == 0
        # drwx--S--- 2 openhands root   64 Aug  7 23:32 .
        # drwxr-xr-x 1 root      root 4.0K Aug  7 23:33 ..
-        for line in obs.content.split('\r\n'):
+        for line in obs.content.split('\n'):
            if ' ..' in line:
                # parent directory should be owned by root
                assert 'root' in line
@ -663,8 +472,10 @@ def test_git_operation(runtime_cls):
        assert obs.exit_code == 0

        # git diff
-        obs = _run_cmd_action(runtime, 'git diff')
+        obs = _run_cmd_action(runtime, 'git diff --no-color --cached')
        assert obs.exit_code == 0
+        assert 'b/test_file.txt' in obs.content
+        assert '+hello' in obs.content

        # git commit
        obs = _run_cmd_action(runtime, 'git commit -m "test commit"')
@ -685,3 +496,276 @@ def test_python_version(temp_dir, runtime_cls, run_as_openhands):
        assert 'Python 3' in obs.content, 'The output should contain "Python 3".'
    finally:
        _close_test_runtime(runtime)
+
+
+def test_pwd_property(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create a subdirectory and verify pwd updates
+        obs = _run_cmd_action(runtime, 'mkdir -p random_dir')
+        assert obs.exit_code == 0
+
+        obs = _run_cmd_action(runtime, 'cd random_dir && pwd')
+        assert obs.exit_code == 0
+        assert 'random_dir' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_basic_command(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test simple command
+        obs = _run_cmd_action(runtime, "echo 'hello world'")
+        assert 'hello world' in obs.content
+        assert obs.exit_code == 0
+
+        # Test command with error
+        obs = _run_cmd_action(runtime, 'nonexistent_command')
+        assert obs.exit_code == 127
+        assert 'nonexistent_command: command not found' in obs.content
+
+        # Test command with special characters
+        obs = _run_cmd_action(runtime, "echo 'hello   world    with\nspecial  chars'")
+        assert 'hello   world    with\nspecial  chars' in obs.content
+        assert obs.exit_code == 0
+
+        # Test multiple commands in sequence
+        obs = _run_cmd_action(runtime, 'echo "first" && echo "second" && echo "third"')
+        assert 'first' in obs.content
+        assert 'second' in obs.content
+        assert 'third' in obs.content
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_interactive_command(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test interactive command
+        action = CmdRunAction('read -p "Enter name: " name && echo "Hello $name"')
+        action.timeout = 1
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        # assert 'Enter name:' in obs.content # FIXME: this is not working
+        assert '[The command timed out after 1 seconds.' in obs.metadata.suffix
+
+        action = CmdRunAction('John')
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Hello John' in obs.content
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+
+        # Test multiline command input with here document
+        action = CmdRunAction("""cat << EOF
+line 1
+line 2
+EOF""")
+        obs = runtime.run_action(action)
+        assert 'line 1\nline 2' in obs.content
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+        assert obs.exit_code == 0
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_output(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Generate a long output
+        action = CmdRunAction('for i in $(seq 1 5000); do echo "Line $i"; done')
+        action.timeout = 10
+        obs = runtime.run_action(action)
+        assert obs.exit_code == 0
+        assert 'Line 1' in obs.content
+        assert 'Line 5000' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_output_exceed_history_limit(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Generate a long output
+        action = CmdRunAction('for i in $(seq 1 50000); do echo "Line $i"; done')
+        action.timeout = 30
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        assert 'Previous command outputs are truncated' in obs.metadata.prefix
+        assert 'Line 40000' in obs.content
+        assert 'Line 50000' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_output_from_nested_directories(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create nested directories with many files
+        setup_cmd = 'mkdir -p /tmp/test_dir && cd /tmp/test_dir && for i in $(seq 1 100); do mkdir -p "folder_$i"; for j in $(seq 1 100); do touch "folder_$i/file_$j.txt"; done; done'
+        setup_action = CmdRunAction(setup_cmd.strip())
+        setup_action.timeout = 60
+        obs = runtime.run_action(setup_action)
+        assert obs.exit_code == 0
+
+        # List the directory structure recursively
+        action = CmdRunAction('ls -R /tmp/test_dir')
+        action.timeout = 60
+        obs = runtime.run_action(action)
+        assert obs.exit_code == 0
+
+        # Verify output contains expected files
+        assert 'folder_1' in obs.content
+        assert 'file_1.txt' in obs.content
+        assert 'folder_100' in obs.content
+        assert 'file_100.txt' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_command_backslash(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Create a file with the content "implemented_function"
+        action = CmdRunAction(
+            'mkdir -p /tmp/test_dir && echo "implemented_function" > /tmp/test_dir/file_1.txt'
+        )
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+
+        # Reproduce an issue we ran into during evaluation
+        # find /workspace/sympy__sympy__1.0 -type f -exec grep -l "implemented_function" {} \;
+        # find: missing argument to `-exec'
+        # --> This is unexpected output due to incorrect escaping of \;
+        # This tests for correct escaping of \;
+        action = CmdRunAction(
+            'find /tmp/test_dir -type f -exec grep -l "implemented_function" {} \\;'
+        )
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.exit_code == 0
+        assert '/tmp/test_dir/file_1.txt' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_command_output_continuation(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Start a command that produces output slowly
+        action = CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done')
+        action.timeout = 2.5  # Set timeout to 2.5 seconds
+        obs = runtime.run_action(action)
+        assert obs.content.strip() == '1'
+        assert obs.metadata.prefix == ''
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+
+        # Continue watching output
+        action = CmdRunAction('')
+        action.timeout = 2.5
+        obs = runtime.run_action(action)
+        assert '[Command output continued from previous command]' in obs.metadata.prefix
+        assert obs.content.strip() == '2'
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+
+        # Continue until completion
+        for expected in ['3', '4', '5']:
+            action = CmdRunAction('')
+            action.timeout = 2.5
+            obs = runtime.run_action(action)
+            assert (
+                '[Command output continued from previous command]'
+                in obs.metadata.prefix
+            )
+            assert obs.content.strip() == expected
+            assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+
+        # Final empty command to complete
+        action = CmdRunAction('')
+        obs = runtime.run_action(action)
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_long_running_command_follow_by_execute(
+    temp_dir, runtime_cls, run_as_openhands
+):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test command that produces output slowly
+        action = CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done')
+        action.timeout = 2.5
+        action.blocking = False
+        obs = runtime.run_action(action)
+        assert '1' in obs.content  # First number should appear before timeout
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+        assert obs.metadata.prefix == ''
+
+        # Continue watching output
+        action = CmdRunAction('')
+        action.timeout = 2.5
+        obs = runtime.run_action(action)
+        assert '2' in obs.content
+        assert (
+            obs.metadata.prefix == '[Command output continued from previous command]\n'
+        )
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+
+        # Test command that produces no output
+        action = CmdRunAction('sleep 15')
+        action.timeout = 2.5
+        obs = runtime.run_action(action)
+        assert '3' in obs.content
+        assert (
+            obs.metadata.prefix == '[Command output continued from previous command]\n'
+        )
+        assert '[The command timed out after 2.5 seconds.' in obs.metadata.suffix
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_empty_command_errors(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test empty command without previous command
+        obs = runtime.run_action(CmdRunAction(''))
+        assert isinstance(obs, CmdOutputObservation)
+        assert 'ERROR: No previous command to continue from' in obs.content
+    finally:
+        _close_test_runtime(runtime)
+
+
+def test_python_interactive_input(temp_dir, runtime_cls, run_as_openhands):
+    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
+    try:
+        # Test Python program that asks for input - properly escaped for bash
+        python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')"""
+
+        # Start Python with the interactive script
+        obs = runtime.run_action(CmdRunAction(f'python3 -c "{python_script}"'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter your name:' in obs.content
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+
+        # Send first input (name)
+        obs = runtime.run_action(CmdRunAction('Alice'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter your age:' in obs.content
+        assert obs.metadata.exit_code == -1
+
+        # Send second input (age)
+        obs = runtime.run_action(CmdRunAction('25'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Hello Alice, you are 25 years old' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+    finally:
+        _close_test_runtime(runtime)
--- a/tests/runtime/test_ipython.py
+++ b/tests/runtime/test_ipython.py
@ -4,7 +4,6 @@ import pytest
 from conftest import (
    TEST_IN_CI,
    _close_test_runtime,
-    _get_sandbox_folder,
    _load_runtime,
 )

@ -33,8 +32,6 @@ from openhands.events.observation import (
 def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)

-    sandbox_dir = _get_sandbox_folder(runtime)
-
    # Test run command
    action_cmd = CmdRunAction(command='ls -l')
    logger.info(action_cmd, extra={'msg_type': 'ACTION'})
@ -55,7 +52,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.content.strip() == (
        'Hello, `World`!\n'
-        f'[Jupyter current working directory: {sandbox_dir}]\n'
+        '[Jupyter current working directory: /openhands/workspace]\n'
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]'
    )

@ -76,7 +73,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):

    assert obs.content == ''
    # event stream runtime will always use absolute path
-    assert obs.path == f'{sandbox_dir}/hello.sh'
+    assert obs.path == '/openhands/workspace/hello.sh'

    # Test read file (file should exist)
    action_read = FileReadAction(path='hello.sh')
@ -88,7 +85,7 @@ def test_simple_cmd_ipython_and_fileop(temp_dir, runtime_cls, run_as_openhands):
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})

    assert obs.content == 'echo "Hello, World!"\n'
-    assert obs.path == f'{sandbox_dir}/hello.sh'
+    assert obs.path == '/openhands/workspace/hello.sh'

    # clean up
    action = CmdRunAction(command='rm -rf hello.sh')
@ -178,7 +175,6 @@ def test_ipython_multi_user(temp_dir, runtime_cls, run_as_openhands):

 def test_ipython_simple(temp_dir, runtime_cls):
    runtime = _load_runtime(temp_dir, runtime_cls)
-    sandbox_dir = _get_sandbox_folder(runtime)

    # Test run ipython
    # get username
@ -192,7 +188,7 @@ def test_ipython_simple(temp_dir, runtime_cls):
        obs.content.strip()
        == (
            '1\n'
-            f'[Jupyter current working directory: {sandbox_dir}]\n'
+            '[Jupyter current working directory: /openhands/workspace]\n'
            '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]'
        ).strip()
    )
@ -203,7 +199,6 @@ def test_ipython_simple(temp_dir, runtime_cls):
 def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands):
    """Make sure that cd in bash also update the current working directory in ipython."""
    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    sandbox_dir = _get_sandbox_folder(runtime)

    # It should error out since pymsgbox is not installed
    action = IPythonRunCellAction(code='import pymsgbox')
@ -229,7 +224,7 @@ def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands):
    # import should not error out
    assert obs.content.strip() == (
        '[Code executed successfully with no output]\n'
-        f'[Jupyter current working directory: {sandbox_dir}]\n'
+        '[Jupyter current working directory: /openhands/workspace]\n'
        '[Jupyter Python interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.12/bin/python]'
    )

@ -239,7 +234,6 @@ def test_ipython_package_install(temp_dir, runtime_cls, run_as_openhands):
 def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls):
    """Test file editor permission behavior when running as different users."""
    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands=True)
-    sandbox_dir = _get_sandbox_folder(runtime)

    # Create a file owned by root with restricted permissions
    action = CmdRunAction(
@ -277,18 +271,18 @@ def test_ipython_file_editor_permissions_as_openhands(temp_dir, runtime_cls):
    assert 'Permission denied' in obs.content

    # Try to use file editor in openhands sandbox directory - should work
-    test_code = f"""
+    test_code = """
 # Create file
-print(file_editor(command='create', path='{sandbox_dir}/test.txt', file_text='Line 1\\nLine 2\\nLine 3'))
+print(file_editor(command='create', path='/openhands/workspace/test.txt', file_text='Line 1\\nLine 2\\nLine 3'))

 # View file
-print(file_editor(command='view', path='{sandbox_dir}/test.txt'))
+print(file_editor(command='view', path='/openhands/workspace/test.txt'))

 # Edit file
-print(file_editor(command='str_replace', path='{sandbox_dir}/test.txt', old_str='Line 2', new_str='New Line 2'))
+print(file_editor(command='str_replace', path='/openhands/workspace/test.txt', old_str='Line 2', new_str='New Line 2'))

 # Undo edit
-print(file_editor(command='undo_edit', path='{sandbox_dir}/test.txt'))
+print(file_editor(command='undo_edit', path='/openhands/workspace/test.txt'))
 """
    action = IPythonRunCellAction(code=test_code)
    logger.info(action, extra={'msg_type': 'ACTION'})
@ -303,7 +297,7 @@ print(file_editor(command='undo_edit', path='{sandbox_dir}/test.txt'))
    assert 'undone successfully' in obs.content

    # Clean up
-    action = CmdRunAction(command=f'rm -f {sandbox_dir}/test.txt')
+    action = CmdRunAction(command='rm -f /openhands/workspace/test.txt')
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@ -318,9 +312,9 @@ print(file_editor(command='undo_edit', path='{sandbox_dir}/test.txt'))
    _close_test_runtime(runtime)


-def test_file_read_and_edit_via_oh_aci(temp_dir, runtime_cls, run_as_openhands):
-    runtime = _load_runtime(temp_dir, runtime_cls, run_as_openhands)
-    sandbox_dir = _get_sandbox_folder(runtime)
+def test_file_read_and_edit_via_oh_aci(runtime_cls, run_as_openhands):
+    runtime = _load_runtime(None, runtime_cls, run_as_openhands)
+    sandbox_dir = '/openhands/workspace'

    actions = [
        {
--- a/tests/unit/resolver/test_resolve_issues.py
+++ b/tests/unit/resolver/test_resolve_issues.py
@ -6,7 +6,11 @@ import pytest

 from openhands.core.config import LLMConfig
 from openhands.events.action import CmdRunAction
-from openhands.events.observation import CmdOutputObservation, NullObservation
+from openhands.events.observation import (
+    CmdOutputMetadata,
+    CmdOutputObservation,
+    NullObservation,
+)
 from openhands.llm.llm import LLM
 from openhands.resolver.github_issue import GithubIssue, ReviewThread
 from openhands.resolver.issue_definitions import IssueHandler, PRHandler
@ -55,23 +59,20 @@ def mock_followup_prompt_template():
    return 'Issue context: {{ issues }}\n\nReview comments: {{ review_comments }}\n\nReview threads: {{ review_threads }}\n\nFiles: {{ files }}\n\nThread comments: {{ thread_context }}\n\nPlease fix this issue.'


-def create_cmd_output(exit_code: int, content: str, command_id: int, command: str):
+def create_cmd_output(exit_code: int, content: str, command: str):
    return CmdOutputObservation(
-        exit_code=exit_code, content=content, command_id=command_id, command=command
+        content=content,
+        command=command,
+        metadata=CmdOutputMetadata(exit_code=exit_code),
    )


 def test_initialize_runtime():
    mock_runtime = MagicMock()
    mock_runtime.run_action.side_effect = [
+        create_cmd_output(exit_code=0, content='', command='cd /workspace'),
        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        ),
-        create_cmd_output(
-            exit_code=0,
-            content='',
-            command_id=2,
-            command='git config --global core.pager ""',
+            exit_code=0, content='', command='git config --global core.pager ""'
        ),
    ]

@ -291,30 +292,19 @@ def test_download_pr_from_github():
 async def test_complete_runtime():
    mock_runtime = MagicMock()
    mock_runtime.run_action.side_effect = [
+        create_cmd_output(exit_code=0, content='', command='cd /workspace'),
        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
+            exit_code=0, content='', command='git config --global core.pager ""'
        ),
        create_cmd_output(
            exit_code=0,
            content='',
-            command_id=2,
-            command='git config --global core.pager ""',
-        ),
-        create_cmd_output(
-            exit_code=0,
-            content='',
-            command_id=3,
            command='git config --global --add safe.directory /workspace',
        ),
        create_cmd_output(
-            exit_code=0,
-            content='',
-            command_id=4,
-            command='git diff base_commit_hash fix',
-        ),
-        create_cmd_output(
-            exit_code=0, content='git diff content', command_id=5, command='git apply'
+            exit_code=0, content='', command='git diff base_commit_hash fix'
        ),
+        create_cmd_output(exit_code=0, content='git diff content', command='git apply'),
    ]

    result = await complete_runtime(mock_runtime, 'base_commit_hash')
@ -614,11 +604,7 @@ def test_guess_success():
        title='Test Issue',
        body='This is a test issue',
    )
-    mock_history = [
-        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        )
-    ]
+    mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')]
    mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key')

    mock_completion_response = MagicMock()
@ -758,11 +744,7 @@ def test_guess_success_negative_case():
        title='Test Issue',
        body='This is a test issue',
    )
-    mock_history = [
-        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        )
-    ]
+    mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')]
    mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key')

    mock_completion_response = MagicMock()
@ -795,11 +777,7 @@ def test_guess_success_invalid_output():
        title='Test Issue',
        body='This is a test issue',
    )
-    mock_history = [
-        create_cmd_output(
-            exit_code=0, content='', command_id=1, command='cd /workspace'
-        )
-    ]
+    mock_history = [create_cmd_output(exit_code=0, content='', command='cd /workspace')]
    mock_llm_config = LLMConfig(model='test_model', api_key='test_api_key')

    mock_completion_response = MagicMock()
--- a/tests/unit/test_action_serialization.py
+++ b/tests/unit/test_action_serialization.py
@ -41,11 +41,10 @@ def serialization_deserialization(
    serialized_action_memory = event_to_memory(action_instance, max_message_chars)
    original_memory_dict = original_action_dict.copy()

-    # we don't send backend properties like id or 'keep_prompt'
+    # we don't send backend properties like id
    original_memory_dict.pop('id', None)
    original_memory_dict.pop('timestamp', None)
    if 'args' in original_memory_dict:
-        original_memory_dict['args'].pop('keep_prompt', None)
        original_memory_dict['args'].pop('blocking', None)
        original_memory_dict['args'].pop('confirmation_state', None)

@ -99,7 +98,6 @@ def test_cmd_run_action_serialization_deserialization():
            'blocking': False,
            'command': 'echo "Hello world"',
            'thought': '',
-            'keep_prompt': True,
            'hidden': False,
            'confirmation_state': ActionConfirmationStatus.CONFIRMED,
        },
@ -154,3 +152,32 @@ def test_file_write_action_serialization_deserialization():
        },
    }
    serialization_deserialization(original_action_dict, FileWriteAction)
+
+
+def test_legacy_serialization():
+    original_action_dict = {
+        'action': 'run',
+        'args': {
+            'blocking': False,
+            'command': 'echo "Hello world"',
+            'thought': '',
+            'hidden': False,
+            'confirmation_state': ActionConfirmationStatus.CONFIRMED,
+            'keep_prompt': False,  # will be treated as no-op
+        },
+    }
+    event = event_from_dict(original_action_dict)
+    assert isinstance(event, Action)
+    assert isinstance(event, CmdRunAction)
+    assert event.command == 'echo "Hello world"'
+    assert event.hidden is False
+    assert not hasattr(event, 'keep_prompt')
+
+    event_dict = event_to_dict(event)
+    assert 'keep_prompt' not in event_dict['args']
+    assert (
+        event_dict['args']['confirmation_state'] == ActionConfirmationStatus.CONFIRMED
+    )
+    assert event_dict['args']['blocking'] is False
+    assert event_dict['args']['command'] == 'echo "Hello world"'
+    assert event_dict['args']['thought'] == ''
--- a/tests/unit/test_bash_parsing.py
+++ b/tests/unit/test_bash_parsing.py
@ -1,6 +1,6 @@
 import pytest

-from openhands.runtime.utils.bash import split_bash_commands
+from openhands.runtime.utils.bash import escape_bash_special_chars, split_bash_commands


 def test_split_commands_util():
@ -257,3 +257,172 @@ def test_split_commands_with_invalid_input():
    for input_command in invalid_inputs:
        # it will fall back to return the original input
        assert split_bash_commands(input_command) == [input_command]
+
+
+def test_escape_bash_special_chars():
+    test_cases = [
+        # Basic cases - literals use doubled backslashes ('\\;' is the text \;), not raw strings
+        ('echo test \\; ls', 'echo test \\\\; ls'),
+        ('grep pattern \\| sort', 'grep pattern \\\\| sort'),
+        ('cmd1 \\&\\& cmd2', 'cmd1 \\\\&\\\\& cmd2'),
+        ('cat file \\> output.txt', 'cat file \\\\> output.txt'),
+        ('cat \\< input.txt', 'cat \\\\< input.txt'),
+        # Quoted strings should remain unchanged
+        ('echo "test \\; unchanged"', 'echo "test \\; unchanged"'),
+        ("echo 'test \\| unchanged'", "echo 'test \\| unchanged'"),
+        # Mixed quoted and unquoted
+        (
+            'echo "quoted \\;" \\; "more" \\| grep',
+            'echo "quoted \\;" \\\\; "more" \\\\| grep',
+        ),
+        # Multiple escapes in sequence
+        ('cmd1 \\;\\|\\& cmd2', 'cmd1 \\\\;\\\\|\\\\& cmd2'),
+        # Commands with other backslashes
+        ('echo test\\ntest', 'echo test\\ntest'),
+        ('echo "test\\ntest"', 'echo "test\\ntest"'),
+        # Edge cases
+        ('', ''),  # Empty string
+        ('\\\\', '\\\\'),  # Double backslash
+        ('\\"', '\\"'),  # Escaped quote
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_with_invalid_syntax():
+    invalid_inputs = [
+        'echo "unclosed quote',
+        "echo 'unclosed quote",
+        'cat <<EOF\nunclosed heredoc',
+    ]
+    for input_cmd in invalid_inputs:
+        # Should return original input when parsing fails
+        result = escape_bash_special_chars(input_cmd)
+        assert result == input_cmd, f'Failed to handle invalid input: {input_cmd}'
+
+
+def test_escape_bash_special_chars_with_heredoc():
+    input_cmd = r"""cat <<EOF
+line1 \; not escaped
+line2 \| not escaped
+EOF"""
+    # Heredoc content should not be escaped
+    expected = input_cmd
+    result = escape_bash_special_chars(input_cmd)
+    assert (
+        result == expected
+    ), f'Failed to handle heredoc correctly\nExpected: {expected}\nGot: {result}'
+
+
+def test_escape_bash_special_chars_with_parameter_expansion():
+    test_cases = [
+        # Parameter expansion should be preserved
+        ('echo $HOME', 'echo $HOME'),
+        ('echo ${HOME}', 'echo ${HOME}'),
+        ('echo ${HOME:-default}', 'echo ${HOME:-default}'),
+        # Mixed with special chars
+        ('echo $HOME \\; ls', 'echo $HOME \\\\; ls'),
+        ('echo ${PATH} \\| grep bin', 'echo ${PATH} \\\\| grep bin'),
+        # Quoted parameter expansion
+        ('echo "$HOME"', 'echo "$HOME"'),
+        ('echo "${HOME}"', 'echo "${HOME}"'),
+        # Complex parameter expansions
+        ('echo ${var:=default} \\; ls', 'echo ${var:=default} \\\\; ls'),
+        ('echo ${!prefix*} \\| sort', 'echo ${!prefix*} \\\\| sort'),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_with_command_substitution():
+    test_cases = [
+        # Basic command substitution
+        ('echo $(pwd)', 'echo $(pwd)'),
+        ('echo `pwd`', 'echo `pwd`'),
+        # Mixed with special chars
+        ('echo $(pwd) \\; ls', 'echo $(pwd) \\\\; ls'),
+        ('echo `pwd` \\| grep home', 'echo `pwd` \\\\| grep home'),
+        # Nested command substitution
+        ('echo $(echo `pwd`)', 'echo $(echo `pwd`)'),
+        # Complex command substitution
+        ('echo $(find . -name "*.txt" \\; ls)', 'echo $(find . -name "*.txt" \\; ls)'),
+        # Mixed with quotes
+        ('echo "$(pwd)"', 'echo "$(pwd)"'),
+        ('echo "`pwd`"', 'echo "`pwd`"'),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_mixed_nodes():
+    test_cases = [
+        # Mix of parameter expansion and command substitution
+        ('echo $HOME/$(pwd)', 'echo $HOME/$(pwd)'),
+        # Mix with special chars
+        ('echo $HOME/$(pwd) \\; ls', 'echo $HOME/$(pwd) \\\\; ls'),
+        # Complex mixed cases
+        (
+            'echo "${HOME}/$(basename `pwd`) \\; next"',
+            'echo "${HOME}/$(basename `pwd`) \\; next"',
+        ),
+        (
+            'VAR=${HOME} \\; echo $(pwd)',
+            'VAR=${HOME} \\\\; echo $(pwd)',
+        ),
+        # Real-world examples
+        (
+            'find . -name "*.txt" -exec grep "${PATTERN:-default}" {} \\;',
+            'find . -name "*.txt" -exec grep "${PATTERN:-default}" {} \\\\;',
+        ),
+        (
+            'echo "Current path: ${PWD}/$(basename `pwd`)" \\| grep home',
+            'echo "Current path: ${PWD}/$(basename `pwd`)" \\\\| grep home',
+        ),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
+
+
+def test_escape_bash_special_chars_with_chained_commands():
+    test_cases = [
+        # Basic chained commands
+        ('ls && pwd', 'ls && pwd'),
+        ('echo "hello" && ls', 'echo "hello" && ls'),
+        # Chained commands with special chars
+        ('ls \\; pwd && echo test', 'ls \\\\; pwd && echo test'),
+        ('echo test && grep pattern \\| sort', 'echo test && grep pattern \\\\| sort'),
+        # Complex chained cases
+        ('echo ${HOME} && ls \\; pwd', 'echo ${HOME} && ls \\\\; pwd'),
+        (
+            'echo "$(pwd)" && cat file \\> out.txt',
+            'echo "$(pwd)" && cat file \\\\> out.txt',
+        ),
+        # Multiple chains
+        ('cmd1 && cmd2 && cmd3', 'cmd1 && cmd2 && cmd3'),
+        (
+            'cmd1 \\; ls && cmd2 \\| grep && cmd3',
+            'cmd1 \\\\; ls && cmd2 \\\\| grep && cmd3',
+        ),
+    ]
+
+    for input_cmd, expected in test_cases:
+        result = escape_bash_special_chars(input_cmd)
+        assert (
+            result == expected
+        ), f'Failed on input "{input_cmd}"\nExpected: "{expected}"\nGot: "{result}"'
--- a/tests/unit/test_bash_ps1_metadata.py
+++ b/tests/unit/test_bash_ps1_metadata.py
@ -0,0 +1,339 @@
+import json
+
+from openhands.events.observation.commands import (
+    CMD_OUTPUT_METADATA_PS1_REGEX,
+    CMD_OUTPUT_PS1_BEGIN,
+    CMD_OUTPUT_PS1_END,
+    CmdOutputMetadata,
+    CmdOutputObservation,
+)
+
+
+def test_ps1_metadata_format():
+    """Test that PS1 prompt has correct format markers"""
+    prompt = CmdOutputMetadata.to_ps1_prompt()
+    print(prompt)
+    assert prompt.startswith('\n###PS1JSON###\n')
+    assert prompt.endswith('\n###PS1END###\n')
+    assert r'\"exit_code\"' in prompt, 'PS1 prompt should contain escaped double quotes'
+
+
+def test_ps1_metadata_json_structure():
+    """Test that PS1 prompt contains valid JSON with expected fields"""
+    prompt = CmdOutputMetadata.to_ps1_prompt()
+    # Extract JSON content between markers
+    json_str = prompt.replace('###PS1JSON###\n', '').replace('\n###PS1END###\n', '')
+    # Remove escaping before parsing
+    json_str = json_str.replace(r'\"', '"')
+    # Remove any trailing content after the JSON
+    json_str = json_str.split('###PS1END###')[0].strip()
+    data = json.loads(json_str)
+
+    # Check required fields
+    expected_fields = {
+        'pid',
+        'exit_code',
+        'username',
+        'hostname',
+        'working_dir',
+        'py_interpreter_path',
+    }
+    assert set(data.keys()) == expected_fields
+
+
+def test_ps1_metadata_parsing():
+    """Test parsing PS1 output into CmdOutputMetadata"""
+    test_data = {
+        'exit_code': 0,
+        'username': 'testuser',
+        'hostname': 'localhost',
+        'working_dir': '/home/testuser',
+        'py_interpreter_path': '/usr/bin/python',
+    }
+
+    ps1_str = f"""###PS1JSON###
+{json.dumps(test_data, indent=2)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == test_data['exit_code']
+    assert metadata.username == test_data['username']
+    assert metadata.hostname == test_data['hostname']
+    assert metadata.working_dir == test_data['working_dir']
+    assert metadata.py_interpreter_path == test_data['py_interpreter_path']
+
+
+def test_ps1_metadata_parsing_string():
+    """Test parsing PS1 output whose exit_code is given as a JSON string"""
+    ps1_str = r"""###PS1JSON###
+{
+  "exit_code": "0",
+  "username": "myname",
+  "hostname": "myhostname",
+  "working_dir": "~/mydir",
+  "py_interpreter_path": "/my/python/path"
+}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.username == 'myname'
+    assert metadata.hostname == 'myhostname'
+    assert metadata.working_dir == '~/mydir'
+    assert metadata.py_interpreter_path == '/my/python/path'
+
+
+def test_ps1_metadata_parsing_string_real_example():
+    """Test parsing a real-world PS1 block with an empty pid and a leading newline"""
+    ps1_str = r"""
+###PS1JSON###
+{
+  "pid": "",
+  "exit_code": "0",
+  "username": "runner",
+  "hostname": "fv-az1055-610",
+  "working_dir": "/home/runner/work/OpenHands/OpenHands",
+  "py_interpreter_path": "/home/runner/.cache/pypoetry/virtualenvs/openhands-ai-ULPBlkAi-py3.12/bin/python"
+}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.username == 'runner'
+    assert metadata.hostname == 'fv-az1055-610'
+    assert metadata.working_dir == '/home/runner/work/OpenHands/OpenHands'
+    assert (
+        metadata.py_interpreter_path
+        == '/home/runner/.cache/pypoetry/virtualenvs/openhands-ai-ULPBlkAi-py3.12/bin/python'
+    )
+
+
+def test_ps1_metadata_parsing_additional_prefix():
+    """Test parsing PS1 output that is preceded by unrelated text"""
+    test_data = {
+        'exit_code': 0,
+        'username': 'testuser',
+        'hostname': 'localhost',
+        'working_dir': '/home/testuser',
+        'py_interpreter_path': '/usr/bin/python',
+    }
+
+    ps1_str = f"""
+This is something that not part of the PS1 prompt
+
+###PS1JSON###
+{json.dumps(test_data, indent=2)}
+###PS1END###
+"""
+
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == test_data['exit_code']
+    assert metadata.username == test_data['username']
+    assert metadata.hostname == test_data['hostname']
+    assert metadata.working_dir == test_data['working_dir']
+    assert metadata.py_interpreter_path == test_data['py_interpreter_path']
+
+
+def test_ps1_metadata_parsing_invalid():
+    """Test that malformed PS1 output yields no matches (padded valid JSON still parses)"""
+    # Test with invalid JSON
+    invalid_json = """###PS1JSON###
+    {invalid json}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(invalid_json)
+    assert len(matches) == 0  # No matches should be found for invalid JSON
+
+    # Test with missing markers
+    invalid_format = """NOT A VALID PS1 PROMPT"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(invalid_format)
+    assert len(matches) == 0
+
+    # Test with empty PS1 metadata
+    empty_metadata = """###PS1JSON###
+
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(empty_metadata)
+    assert len(matches) == 0  # No matches should be found for empty metadata
+
+    # Test with whitespace in PS1 metadata
+    whitespace_metadata = """###PS1JSON###
+
+    {
+        "exit_code": "0",
+        "pid": "123",
+        "username": "test",
+        "hostname": "localhost",
+        "working_dir": "/home/test",
+        "py_interpreter_path": "/usr/bin/python"
+    }
+
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(whitespace_metadata)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.pid == 123
+
+
+def test_ps1_metadata_missing_fields():
+    """Test handling of missing fields in PS1 metadata"""
+    # Test with only exit_code and pid present
+    minimal_data = {'exit_code': 0, 'pid': 123}
+    ps1_str = f"""###PS1JSON###
+{json.dumps(minimal_data)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == 0
+    assert metadata.pid == 123
+    assert metadata.username is None
+    assert metadata.hostname is None
+    assert metadata.working_dir is None
+    assert metadata.py_interpreter_path is None
+
+    # Test with missing exit_code but valid pid
+    no_exit_code = {'pid': 123, 'username': 'test'}
+    ps1_str = f"""###PS1JSON###
+{json.dumps(no_exit_code)}
+###PS1END###
+"""
+    matches = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(matches) == 1
+    metadata = CmdOutputMetadata.from_ps1_match(matches[0])
+    assert metadata.exit_code == -1  # default value
+    assert metadata.pid == 123
+    assert metadata.username == 'test'
+
+
+def test_ps1_metadata_multiple_blocks():
+    """Check that two PS1 blocks in one string are both found and parsed."""
+    test_data = {
+        'exit_code': 0,
+        'username': 'testuser',
+        'hostname': 'localhost',
+        'working_dir': '/home/testuser',
+        'py_interpreter_path': '/usr/bin/python',
+    }
+
+    # The same block appears twice, separated by one unrelated line of text
+    payload = json.dumps(test_data, indent=2)
+    single_block = f'###PS1JSON###\n{payload}\n###PS1END###\n'
+    ps1_str = single_block + 'Some other content\n' + single_block
+
+    found = CmdOutputMetadata.matches_ps1_metadata(ps1_str)
+    assert len(found) == 2  # Should find both blocks
+
+    # Parse both blocks first, then verify each one
+    parsed_blocks = [CmdOutputMetadata.from_ps1_match(item) for item in found]
+    for parsed in parsed_blocks:
+        assert parsed.exit_code == test_data['exit_code']
+
+
+def test_ps1_metadata_regex_pattern():
+    """Check what the PS1 metadata regex captures in its first group."""
+    begin, end = CMD_OUTPUT_PS1_BEGIN, CMD_OUTPUT_PS1_END
+    cases = (
+        # Basic pattern matching
+        (f'{begin}test\n{end}', 'test'),
+        # Content before and after the block
+        (f'prefix\n{begin}test\n{end}suffix', 'test'),
+        # Multiline content
+        (f'{begin}line1\nline2\nline3\n{end}', 'line1\nline2\nline3'),
+    )
+    for text, expected in cases:
+        # Only the first match is inspected
+        first_match = next(CMD_OUTPUT_METADATA_PS1_REGEX.finditer(text))
+        assert first_match.group(1).strip() == expected
+
+
+def test_cmd_output_observation_properties():
+    """Check derived properties of CmdOutputObservation for exit codes 0 and 1."""
+    # Successful command
+    ok_obs = CmdOutputObservation(
+        command='ls',
+        content='file1\nfile2',
+        metadata=CmdOutputMetadata(exit_code=0, pid=123),
+    )
+    assert ok_obs.command_id == 123
+    assert ok_obs.exit_code == 0
+    assert not ok_obs.error
+    assert 'exit code 0' in ok_obs.message
+    assert 'ls' in ok_obs.message
+    for fragment in ('file1', 'file2', 'metadata'):
+        assert fragment in str(ok_obs)
+
+    # Failed command
+    failed_obs = CmdOutputObservation(
+        command='invalid',
+        content='error',
+        metadata=CmdOutputMetadata(exit_code=1, pid=456),
+    )
+    assert failed_obs.command_id == 456
+    assert failed_obs.exit_code == 1
+    assert failed_obs.error
+    assert 'exit code 1' in failed_obs.message
+    assert 'invalid' in failed_obs.message
+    assert 'error' in str(failed_obs)
+
+
+def test_ps1_metadata_empty_fields():
+    """Check empty-string fields and irregularly spaced JSON in PS1 metadata."""
+    text_fields = ('username', 'hostname', 'working_dir', 'py_interpreter_path')
+
+    # Empty strings must come back as '' after parsing
+    blank_payload = {'exit_code': 0, 'pid': 123}
+    blank_payload.update({name: '' for name in text_fields})
+    block = '###PS1JSON###\n' + json.dumps(blank_payload) + '\n###PS1END###\n'
+    found = CmdOutputMetadata.matches_ps1_metadata(block)
+    assert len(found) == 1
+    parsed = CmdOutputMetadata.from_ps1_match(found[0])
+    assert parsed.exit_code == 0
+    assert parsed.pid == 123
+    for name in text_fields:
+        assert getattr(parsed, name) == ''
+
+    # Irregularly spaced but valid JSON must parse to the same typed values
+    spaced_block = """###PS1JSON###
+    {
+        "exit_code":0,
+        "pid"  :  123,
+        "username":    "test"  ,
+        "hostname": "host",
+        "working_dir"    :"dir",
+        "py_interpreter_path":"path"
+    }
+###PS1END###
+"""
+    found = CmdOutputMetadata.matches_ps1_metadata(spaced_block)
+    assert len(found) == 1
+    parsed = CmdOutputMetadata.from_ps1_match(found[0])
+    expected = {
+        'exit_code': 0,
+        'pid': 123,
+        'username': 'test',
+        'hostname': 'host',
+        'working_dir': 'dir',
+        'py_interpreter_path': 'path',
+    }
+    for name, value in expected.items():
+        assert getattr(parsed, name) == value
--- a/tests/unit/test_bash_session.py
+++ b/tests/unit/test_bash_session.py
@ -0,0 +1,384 @@
+import os
+import tempfile
+
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action import CmdRunAction
+from openhands.runtime.utils.bash import BashCommandStatus, BashSession
+
+
+def test_session_initialization():
+    """Check session start-up with a custom working directory and username.
+
+    Each session is closed in a ``finally`` clause so that a failing
+    assertion does not leave the session open.
+    """
+    # Test with custom working directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        session = BashSession(work_dir=temp_dir)
+        session.initialize()
+        try:
+            obs = session.execute(CmdRunAction('pwd'))
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+            assert temp_dir in obs.content
+            assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+        finally:
+            session.close()
+
+    # Test with custom username
+    session = BashSession(work_dir=os.getcwd(), username='nobody')
+    session.initialize()
+    try:
+        assert 'openhands-nobody' in session.session.name
+    finally:
+        session.close()
+
+
+def test_cwd_property(tmp_path):
+    """Check that ``session.cwd`` follows a ``cd`` executed in the session.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=tmp_path)
+    session.initialize()
+    try:
+        # Change directory and verify pwd updates
+        random_dir = tmp_path / 'random'
+        random_dir.mkdir()
+        session.execute(CmdRunAction(f'cd {random_dir}'))
+        assert session.cwd == str(random_dir)
+    finally:
+        session.close()
+
+
+def test_basic_command():
+    """Run a succeeding, a failing and a chained command and check metadata.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    completed_ok = '\n[The command completed with exit code 0.]'
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+    try:
+        # Test simple command
+        obs = session.execute(CmdRunAction("echo 'hello world'"))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'hello world' in obs.content
+        assert obs.metadata.suffix == completed_ok
+        assert obs.metadata.prefix == ''
+        assert obs.metadata.exit_code == 0
+        assert session.prev_status == BashCommandStatus.COMPLETED
+
+        # Test command with error
+        obs = session.execute(CmdRunAction('nonexistent_command'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.metadata.exit_code == 127
+        assert 'nonexistent_command: command not found' in obs.content
+        assert obs.metadata.suffix == '\n[The command completed with exit code 127.]'
+        assert obs.metadata.prefix == ''
+        assert session.prev_status == BashCommandStatus.COMPLETED
+
+        # Test multiple commands in sequence
+        obs = session.execute(
+            CmdRunAction('echo "first" && echo "second" && echo "third"')
+        )
+        assert 'first' in obs.content
+        assert 'second' in obs.content
+        assert 'third' in obs.content
+        assert obs.metadata.suffix == completed_ok
+        assert obs.metadata.prefix == ''
+        assert obs.metadata.exit_code == 0
+        assert session.prev_status == BashCommandStatus.COMPLETED
+    finally:
+        session.close()
+
+
+def test_long_running_command_follow_by_execute():
+    """Check timeout reporting when further input follows a running command.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    # Suffix expected on every observation returned after the 2-second
+    # no-change timeout configured below
+    no_change_suffix = (
+        '\n[The command has no new output after 2 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    continued_prefix = '[Command output continued from previous command]\n'
+
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+    try:
+        # Test command that produces output slowly
+        obs = session.execute(
+            CmdRunAction('for i in {1..3}; do echo $i; sleep 3; done', blocking=False)
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '1' in obs.content  # First number should appear before timeout
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+        assert obs.metadata.suffix == no_change_suffix
+        assert obs.metadata.prefix == ''
+
+        # Continue watching output
+        obs = session.execute(CmdRunAction(''))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '2' in obs.content
+        assert obs.metadata.prefix == continued_prefix
+        assert obs.metadata.suffix == no_change_suffix
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+        # Send another command while the loop is still running; the
+        # observation is expected to carry the loop's remaining output
+        obs = session.execute(CmdRunAction('sleep 15'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '3' in obs.content
+        assert obs.metadata.prefix == continued_prefix
+        assert obs.metadata.suffix == no_change_suffix
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+    finally:
+        session.close()
+
+
+def test_interactive_command():
+    """Check commands that wait for input: a ``read`` prompt and a heredoc.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    # Suffix expected on every observation returned after the 3-second
+    # no-change timeout configured below
+    no_change_suffix = (
+        '\n[The command has no new output after 3 seconds. '
+        "You may wait longer to see additional output by sending empty command '', "
+        'send other commands to interact with the current process, '
+        'or send keys to interrupt/kill the command.]'
+    )
+    continued_prefix = '[Command output continued from previous command]\n'
+    completed_ok = '\n[The command completed with exit code 0.]'
+
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=3)
+    session.initialize()
+    try:
+        # Test interactive command that waits for user input
+        # (no explicit blocking flag is passed to CmdRunAction)
+        obs = session.execute(
+            CmdRunAction(
+                'read -p \'Enter name: \' name && echo "Hello $name"',
+            )
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter name:' in obs.content
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+        assert obs.metadata.suffix == no_change_suffix
+        assert obs.metadata.prefix == ''
+
+        # Send input
+        obs = session.execute(CmdRunAction('John'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Hello John' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert obs.metadata.suffix == completed_ok
+        assert obs.metadata.prefix == ''
+        assert session.prev_status == BashCommandStatus.COMPLETED
+
+        # Test multiline command input
+        obs = session.execute(CmdRunAction('cat << EOF'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.metadata.exit_code == -1
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+        assert obs.metadata.suffix == no_change_suffix
+        assert obs.metadata.prefix == ''
+
+        # Each heredoc line keeps the command waiting for more input
+        for heredoc_line in ('line 1', 'line 2'):
+            obs = session.execute(CmdRunAction(heredoc_line))
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+            assert obs.metadata.exit_code == -1
+            assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+            assert obs.metadata.suffix == no_change_suffix
+            assert obs.metadata.prefix == continued_prefix
+
+        # The terminator ends the heredoc and the command completes
+        obs = session.execute(CmdRunAction('EOF'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'line 1' in obs.content and 'line 2' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert obs.metadata.suffix == completed_ok
+        assert obs.metadata.prefix == ''
+    finally:
+        session.close()
+
+
+def test_ctrl_c():
+    """Check that sending ``C-c`` ends a running loop with exit code 130.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+    try:
+        # Start infinite loop
+        obs = session.execute(
+            CmdRunAction("while true; do echo 'looping'; sleep 3; done"),
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'looping' in obs.content
+        assert obs.metadata.suffix == (
+            '\n[The command has no new output after 2 seconds. '
+            "You may wait longer to see additional output by sending empty command '', "
+            'send other commands to interact with the current process, '
+            'or send keys to interrupt/kill the command.]'
+        )
+        assert obs.metadata.prefix == ''
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+        # Send Ctrl+C
+        obs = session.execute(CmdRunAction('C-c'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.metadata.exit_code == 130  # Standard exit code for Ctrl+C
+        assert (
+            obs.metadata.suffix
+            == '\n[The command completed with exit code 130. CTRL+C was sent.]'
+        )
+        assert obs.metadata.prefix == ''
+        assert session.prev_status == BashCommandStatus.COMPLETED
+    finally:
+        session.close()
+
+
+def test_empty_command_errors():
+    """Check the error returned for an empty command with nothing to continue.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+    try:
+        # Test empty command without previous command
+        obs = session.execute(CmdRunAction(''))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert (
+            obs.content
+            == 'ERROR: No previous command to continue from. Previous command has to be timeout to be continued.'
+        )
+        assert obs.metadata.exit_code == -1
+        assert obs.metadata.prefix == ''
+        assert obs.metadata.suffix == ''
+        # No command ran, so no status has been recorded yet
+        assert session.prev_status is None
+    finally:
+        session.close()
+
+
+def test_command_output_continuation():
+    """Check that empty commands fetch successive output of a slow loop.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    timeout_marker = '[The command has no new output after 2 seconds.'
+    continued_marker = '[Command output continued from previous command]'
+
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+    try:
+        # Start a command that produces output slowly
+        obs = session.execute(
+            CmdRunAction('for i in {1..5}; do echo $i; sleep 3; done')
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert obs.content.strip() == '1'
+        assert obs.metadata.prefix == ''
+        assert timeout_marker in obs.metadata.suffix
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+        # Each empty command is expected to return exactly the next number
+        for expected_number in ('2', '3', '4', '5'):
+            obs = session.execute(CmdRunAction(''))
+            logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+            assert continued_marker in obs.metadata.prefix
+            assert obs.content.strip() == expected_number
+            assert timeout_marker in obs.metadata.suffix
+            assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+        # One more empty command observes the loop's completion
+        obs = session.execute(CmdRunAction(''))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert '[The command completed with exit code 0.]' in obs.metadata.suffix
+        assert session.prev_status == BashCommandStatus.COMPLETED
+    finally:
+        session.close()
+
+
+def test_long_output():
+    """Check that 5000 lines of output are returned without a prefix notice.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+    try:
+        # Generate a long output that may exceed buffer size
+        obs = session.execute(
+            CmdRunAction('for i in {1..5000}; do echo "Line $i"; done')
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Line 1' in obs.content
+        assert 'Line 5000' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert obs.metadata.prefix == ''
+        assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    finally:
+        session.close()
+
+
+def test_long_output_exceed_history_limit():
+    """Check that 50000 lines of output are reported as truncated.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+    try:
+        # Generate an output long enough that earlier lines are dropped
+        obs = session.execute(
+            CmdRunAction('for i in {1..50000}; do echo "Line $i"; done')
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Previous command outputs are truncated' in obs.metadata.prefix
+        # The tail of the output must still be present
+        assert 'Line 40000' in obs.content
+        assert 'Line 50000' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    finally:
+        session.close()
+
+
+def test_multiline_command():
+    """Check that a command spanning several lines runs as one command.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=os.getcwd())
+    session.initialize()
+    try:
+        # Test multiline command with PS2 prompt disabled
+        obs = session.execute(
+            CmdRunAction("""if true; then
+echo "inside if"
+fi""")
+        )
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'inside if' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert obs.metadata.prefix == ''
+        assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+    finally:
+        session.close()
+
+
+def test_python_interactive_input():
+    """Check feeding two ``input()`` prompts of a Python one-liner.
+
+    The session is closed in a ``finally`` clause so that a failing
+    assertion does not leave it open.
+    """
+    session = BashSession(work_dir=os.getcwd(), no_change_timeout_seconds=2)
+    session.initialize()
+    try:
+        # Test Python program that asks for input - properly escaped for bash
+        python_script = """name = input('Enter your name: '); age = input('Enter your age: '); print(f'Hello {name}, you are {age} years old')"""
+
+        # Start Python with the interactive script
+        obs = session.execute(CmdRunAction(f'python3 -c "{python_script}"'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter your name:' in obs.content
+        assert obs.metadata.exit_code == -1  # -1 indicates command is still running
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+        # Send first input (name)
+        obs = session.execute(CmdRunAction('Alice'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Enter your age:' in obs.content
+        assert obs.metadata.exit_code == -1
+        assert session.prev_status == BashCommandStatus.NO_CHANGE_TIMEOUT
+
+        # Send second input (age)
+        obs = session.execute(CmdRunAction('25'))
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert 'Hello Alice, you are 25 years old' in obs.content
+        assert obs.metadata.exit_code == 0
+        assert obs.metadata.suffix == '\n[The command completed with exit code 0.]'
+        assert session.prev_status == BashCommandStatus.COMPLETED
+    finally:
+        session.close()
--- a/tests/unit/test_codeact_agent.py
+++ b/tests/unit/test_codeact_agent.py
@ -26,6 +26,7 @@ from openhands.events.action import (
 from openhands.events.event import EventSource, FileEditSource, FileReadSource
 from openhands.events.observation.browse import BrowserOutputObservation
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
@ -50,7 +51,11 @@ def agent() -> CodeActAgent:
 def test_cmd_output_observation_message(agent: CodeActAgent):
    agent.config.function_calling = False
    obs = CmdOutputObservation(
-        command='echo hello', content='Command output', command_id=1, exit_code=0
+        command='echo hello',
+        content='Command output',
+        metadata=CmdOutputMetadata(
+            exit_code=0,
+        ),
    )

    results = agent.get_observation_message(obs, tool_call_id_to_message={})
@ -62,7 +67,7 @@ def test_cmd_output_observation_message(agent: CodeActAgent):
    assert len(result.content) == 1
    assert isinstance(result.content[0], TextContent)
    assert 'Command output' in result.content[0].text
-    assert 'Command finished with exit code 0' in result.content[0].text
+    assert '[Command finished with exit code 0]' in result.content[0].text


 def test_ipython_run_cell_observation_message(agent: CodeActAgent):
--- a/tests/unit/test_command_success.py
+++ b/tests/unit/test_command_success.py
@ -1,4 +1,5 @@
 from openhands.events.observation.commands import (
+    CmdOutputMetadata,
    CmdOutputObservation,
    IPythonRunCellObservation,
 )
@ -7,14 +8,18 @@ from openhands.events.observation.commands import (
 def test_cmd_output_success():
    # Test successful command
    obs = CmdOutputObservation(
-        command_id=1, command='ls', content='file1.txt\nfile2.txt', exit_code=0
+        command='ls',
+        content='file1.txt\nfile2.txt',
+        metadata=CmdOutputMetadata(exit_code=0),
    )
    assert obs.success is True
    assert obs.error is False

    # Test failed command
    obs = CmdOutputObservation(
-        command_id=2, command='ls', content='No such file or directory', exit_code=1
+        command='ls',
+        content='No such file or directory',
+        metadata=CmdOutputMetadata(exit_code=1),
    )
    assert obs.success is False
    assert obs.error is True
--- a/tests/unit/test_event_serialization.py
+++ b/tests/unit/test_event_serialization.py
@ -1,18 +1,22 @@
-from openhands.events.observation import CmdOutputObservation
+from openhands.events.observation import CmdOutputMetadata, CmdOutputObservation
 from openhands.events.serialization import event_to_dict


 def test_command_output_success_serialization():
    # Test successful command
    obs = CmdOutputObservation(
-        command_id=1, command='ls', content='file1.txt\nfile2.txt', exit_code=0
+        command='ls',
+        content='file1.txt\nfile2.txt',
+        metadata=CmdOutputMetadata(exit_code=0),
    )
    serialized = event_to_dict(obs)
    assert serialized['success'] is True

    # Test failed command
    obs = CmdOutputObservation(
-        command_id=2, command='ls', content='No such file or directory', exit_code=1
+        command='ls',
+        content='No such file or directory',
+        metadata=CmdOutputMetadata(exit_code=1),
    )
    serialized = event_to_dict(obs)
    assert serialized['success'] is False
--- a/tests/unit/test_is_stuck.py
+++ b/tests/unit/test_is_stuck.py
@ -107,7 +107,7 @@ class TestStuckDetector:
        cmd_action = CmdRunAction(command='ls')
        state.history.append(cmd_action)
        cmd_observation = CmdOutputObservation(
-            command_id=1, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
        )
        # cmd_observation._cause = cmd_action._id
        state.history.append(cmd_observation)
@ -188,7 +188,7 @@ class TestStuckDetector:
        cmd_action_1 = CmdRunAction(command='ls')
        cmd_action_1._id = 1
        state.history.append(cmd_action_1)
-        cmd_observation_1 = CmdOutputObservation(content='', command='ls', command_id=1)
+        cmd_observation_1 = CmdOutputObservation(content='', command='ls')
        cmd_observation_1._cause = cmd_action_1._id
        state.history.append(cmd_observation_1)
        # 4 events
@ -196,7 +196,7 @@ class TestStuckDetector:
        cmd_action_2 = CmdRunAction(command='ls')
        cmd_action_2._id = 2
        state.history.append(cmd_action_2)
-        cmd_observation_2 = CmdOutputObservation(content='', command='ls', command_id=2)
+        cmd_observation_2 = CmdOutputObservation(content='', command='ls')
        cmd_observation_2._cause = cmd_action_2._id
        state.history.append(cmd_observation_2)
        # 6 events
@ -212,7 +212,7 @@ class TestStuckDetector:
        cmd_action_3 = CmdRunAction(command='ls')
        cmd_action_3._id = 3
        state.history.append(cmd_action_3)
-        cmd_observation_3 = CmdOutputObservation(content='', command='ls', command_id=3)
+        cmd_observation_3 = CmdOutputObservation(content='', command='ls')
        cmd_observation_3._cause = cmd_action_3._id
        state.history.append(cmd_observation_3)
        # 10 events
@ -223,7 +223,7 @@ class TestStuckDetector:
        cmd_action_4 = CmdRunAction(command='ls')
        cmd_action_4._id = 4
        state.history.append(cmd_action_4)
-        cmd_observation_4 = CmdOutputObservation(content='', command='ls', command_id=4)
+        cmd_observation_4 = CmdOutputObservation(content='', command='ls')
        cmd_observation_4._cause = cmd_action_4._id
        state.history.append(cmd_observation_4)
        # 12 events
@ -436,7 +436,7 @@ class TestStuckDetector:
        cmd_action_1 = CmdRunAction(command='ls')
        state.history.append(cmd_action_1)
        cmd_observation_1 = CmdOutputObservation(
-            command_id=1, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
        )
        # cmd_observation_1._cause = cmd_action_1._id
        state.history.append(cmd_observation_1)
@ -452,7 +452,7 @@ class TestStuckDetector:
        cmd_action_2 = CmdRunAction(command='ls')
        state.history.append(cmd_action_2)
        cmd_observation_2 = CmdOutputObservation(
-            command_id=2, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
        )
        # cmd_observation_2._cause = cmd_action_2._id
        state.history.append(cmd_observation_2)
@ -475,7 +475,7 @@ class TestStuckDetector:
        cmd_action_3 = CmdRunAction(command='ls')
        state.history.append(cmd_action_3)
        cmd_observation_3 = CmdOutputObservation(
-            command_id=3, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
        )
        # cmd_observation_3._cause = cmd_action_3._id
        state.history.append(cmd_observation_3)
@ -506,7 +506,7 @@ class TestStuckDetector:
        cmd_action_1 = CmdRunAction(command='ls')
        state.history.append(cmd_action_1)
        cmd_observation_1 = CmdOutputObservation(
-            command_id=cmd_action_1.id, command='ls', content='file1.txt\nfile2.txt'
+            command='ls', content='file1.txt\nfile2.txt'
        )
        # cmd_observation_1._cause = cmd_action_1._id
        state.history.append(cmd_observation_1)
@ -521,9 +521,7 @@ class TestStuckDetector:

        cmd_action_2 = CmdRunAction(command='pwd')
        state.history.append(cmd_action_2)
-        cmd_observation_2 = CmdOutputObservation(
-            command_id=2, command='pwd', content='/home/user'
-        )
+        cmd_observation_2 = CmdOutputObservation(command='pwd', content='/home/user')
        # cmd_observation_2._cause = cmd_action_2._id
        state.history.append(cmd_observation_2)

@ -541,9 +539,7 @@ class TestStuckDetector:

        cmd_action_3 = CmdRunAction(command='pwd')
        state.history.append(cmd_action_3)
-        cmd_observation_3 = CmdOutputObservation(
-            command_id=cmd_action_3.id, command='pwd', content='/home/user'
-        )
+        cmd_observation_3 = CmdOutputObservation(command='pwd', content='/home/user')
        # cmd_observation_3._cause = cmd_action_3._id
        state.history.append(cmd_observation_3)

@ -590,7 +586,6 @@ class TestStuckDetector:
        # Add an observation event between the repeated message actions
        cmd_output_observation = CmdOutputObservation(
            content='OK, I was stuck, but no more.',
-            command_id=42,
            command='storybook',
            exit_code=0,
        )
--- a/tests/unit/test_observation_serialization.py
+++ b/tests/unit/test_observation_serialization.py
@ -1,4 +1,5 @@
 from openhands.events.observation import (
+    CmdOutputMetadata,
    CmdOutputObservation,
    Observation,
 )
@ -40,13 +41,65 @@ def serialization_deserialization(


 # Additional tests for various observation subclasses can be included here
+def test_observation_event_props_serialization_deserialization():
+    """Pass a ``run`` observation dict with event properties to the helper.
+
+    The dict carries ``id``, ``source`` and ``timestamp`` in addition to the
+    observation payload, and nests the command metadata under ``extras``.
+    """
+    metadata = {
+        'exit_code': 0,
+        'hostname': None,
+        'pid': -1,
+        'prefix': '',
+        'py_interpreter_path': None,
+        'suffix': '',
+        'username': None,
+        'working_dir': None,
+    }
+    extras = {'command': 'ls -l', 'hidden': False, 'metadata': metadata}
+    original_observation_dict = dict(
+        id=42,
+        source='agent',
+        timestamp='2021-08-01T12:00:00',
+        observation='run',
+        message='Command `ls -l` executed with exit code 0.',
+        extras=extras,
+        content='foo.txt',
+        success=True,
+    )
+    serialization_deserialization(original_observation_dict, CmdOutputObservation)
+
+
+def test_command_output_observation_serialization_deserialization():
+    """Pass a ``run`` observation dict without event properties to the helper.
+
+    The dict has no ``id``, ``source`` or ``timestamp``; the command metadata
+    is nested under ``extras``.
+    """
+    metadata = {
+        'exit_code': 0,
+        'hostname': None,
+        'pid': -1,
+        'prefix': '',
+        'py_interpreter_path': None,
+        'suffix': '',
+        'username': None,
+        'working_dir': None,
+    }
+    extras = {'command': 'ls -l', 'hidden': False, 'metadata': metadata}
+    original_observation_dict = dict(
+        observation='run',
+        extras=extras,
+        message='Command `ls -l` executed with exit code 0.',
+        content='foo.txt',
+        success=True,
+    )
+    serialization_deserialization(original_observation_dict, CmdOutputObservation)
+
+
 def test_success_field_serialization():
    # Test success=True
    obs = CmdOutputObservation(
        content='Command succeeded',
-        exit_code=0,
        command='ls -l',
-        command_id=3,
+        metadata=CmdOutputMetadata(
+            exit_code=0,
+        ),
    )
    serialized = event_to_dict(obs)
    assert serialized['success'] is True
@ -54,9 +107,42 @@ def test_success_field_serialization():
    # Test success=False
    obs = CmdOutputObservation(
        content='No such file or directory',
-        exit_code=1,
        command='ls -l',
-        command_id=3,
+        metadata=CmdOutputMetadata(
+            exit_code=1,
+        ),
    )
    serialized = event_to_dict(obs)
    assert serialized['success'] is False
+
+
+def test_legacy_serialization():
+    original_observation_dict = {
+        'id': 42,
+        'source': 'agent',
+        'timestamp': '2021-08-01T12:00:00',
+        'observation': 'run',
+        'message': 'Command `ls -l` executed with exit code 0.',
+        'extras': {
+            'command': 'ls -l',
+            'hidden': False,
+            'exit_code': 0,
+            'command_id': 3,
+        },
+        'content': 'foo.txt',
+        'success': True,
+    }
+    event = event_from_dict(original_observation_dict)
+    assert isinstance(event, Observation)
+    assert isinstance(event, CmdOutputObservation)
+    assert event.metadata.exit_code == 0
+    assert event.success is True
+    assert event.command == 'ls -l'
+    assert event.hidden is False
+
+    event_dict = event_to_dict(event)
+    assert event_dict['success'] is True
+    assert event_dict['extras']['metadata']['exit_code'] == 0
+    assert event_dict['extras']['metadata']['pid'] == 3
+    assert event_dict['extras']['command'] == 'ls -l'
+    assert event_dict['extras']['hidden'] is False
--- a/tests/unit/test_security.py
+++ b/tests/unit/test_security.py
@ -368,7 +368,6 @@ async def test_unsafe_bash_command(temp_dir: str):
                            'blocking': False,
                            'command': 'ls',
                            'hidden': False,
-                            'keep_prompt': True,
                            'confirmation_state': ActionConfirmationStatus.CONFIRMED,
                        },
                    ),
@ -495,9 +494,7 @@ def test_parse_action(action, expected_trace):
            ],
        ),
        (
-            CmdOutputObservation(
-                content='cmd output content', command_id=1, command='ls'
-            ),
+            CmdOutputObservation(content='cmd output content', command='ls'),
            [
                ToolOutput(
                    metadata={},