OpenHands/tests/integration/regenerate.sh

#!/bin/bash
set -eo pipefail

run_test() {
  SANDBOX_TYPE=$SANDBOX_TYPE \
    WORKSPACE_BASE=$WORKSPACE_BASE \
    MAX_ITERATIONS=$MAX_ITERATIONS \
    WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
    AGENT=$agent \
    poetry run pytest -s ./tests/integration/test_agent.py::$test_name
    # return exit code of pytest
    return $?
}

if [ -z $WORKSPACE_MOUNT_PATH ]; then
  WORKSPACE_MOUNT_PATH=$(pwd)
fi
if [ -z $WORKSPACE_BASE ]; then
  WORKSPACE_BASE=$(pwd)
fi

WORKSPACE_MOUNT_PATH+="/_test_workspace"
WORKSPACE_BASE+="/_test_workspace"

SANDBOX_TYPE="ssh"
MAX_ITERATIONS=10

agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
tasks=(
  "Fix typos in bad.txt."
  "Write a shell script 'hello.sh' that prints 'hello'."
  "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
)
test_names=(
  "test_edits"
  "test_write_simple_script"
  "test_ipython"
)

num_of_tests=${#test_names[@]}
num_of_agents=${#agents[@]}

if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
  echo "Every task must correspond to one test case"
  exit 1
fi

rm -rf logs
rm -rf $WORKSPACE_BASE
for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}

  # skip other tests if only one test is specified
  if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
    continue
  fi

  for ((j = 0; j < num_of_agents; j++)); do
    agent=${agents[j]}

    # skip other agents if only one agent is specified
    if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
      continue
    fi

    echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
    rm -rf $WORKSPACE_BASE
    mkdir $WORKSPACE_BASE
    if [ -d "tests/integration/workspace/$test_name" ]; then
      cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
    fi

    if [ "$TEST_ONLY" = true ]; then
      set -e
    else
      # Temporarily disable 'exit on error'
      set +e
    fi

    run_test
    TEST_STATUS=$?
    # Re-enable 'exit on error'
    set -e

    if [[ $TEST_STATUS -ne 0 ]]; then
      echo -e "\n\n\n\n========$test_name failed, regenerating test data for $agent========\n\n\n\n"
      sleep 1

      rm -rf $WORKSPACE_BASE
      mkdir -p $WORKSPACE_BASE
      if [ -d "tests/integration/workspace/$test_name" ]; then
        cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
      fi

      rm -rf logs
      rm -rf tests/integration/mock/$agent/$test_name/*
      # set -x to print the command being executed
      set -x
      echo -e "/exit\n" | \
        SANDBOX_TYPE=$SANDBOX_TYPE \
        WORKSPACE_BASE=$WORKSPACE_BASE \
        DEBUG=true \
        WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
        poetry run python ./opendevin/core/main.py \
        -i $MAX_ITERATIONS \
        -t "$task Do not ask me for confirmation at any point." \
        -c $agent
      set +x

      mkdir -p tests/integration/mock/$agent/$test_name/
      mv logs/llm/**/* tests/integration/mock/$agent/$test_name/

      echo -e "\n\n\n\n========$test_name test data regenerated for $agent, rerun test again to verify========\n\n\n\n"
      # Temporarily disable 'exit on error'
      set +e
      run_test
      TEST_STATUS=$?
      # Re-enable 'exit on error'
      set -e

      if [[ $TEST_STATUS -ne 0 ]]; then
        echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
        echo -e "There are multiple possibilities:"
        echo -e "  1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
        echo -e "  2. The agent thinks itself has finished the task, but fails the validation in the test code."
        echo -e "  3. There is something non-deterministic in the prompt."
        echo -e "  4. There is a bug in this script, or in OpenDevin code."
        echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
        echo -e "      You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
        exit 1
      else
        echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
        sleep 1
      fi


    else
      echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
      sleep 1
    fi
  done
done

rm -rf logs
rm -rf $WORKSPACE_BASE
echo "Done!"