OpenHands/tests/integration/regenerate.sh
Boxuan Li 6714000b2c
CodeActAgent: Fix iteration reminder (#1803)
This PR includes three changes:
1) Iteration reminder should start with MAX_ITERATIONS from config rather than default value 100
2) In the first prompt, we should tell the LLM it has `MAX_ITERATIONS - 1` turns left, rather than `MAX_ITERATIONS - 2`
3) Remove legacy ITERATION_REMINDER config
2024-05-15 13:48:47 +08:00

148 lines
4.3 KiB
Bash
Executable File

#!/bin/bash
#
# Runs the integration tests in tests/integration/test_agent.py for each
# agent/test combination listed below. When a test fails, the mock LLM data
# under tests/integration/mock/<agent>/<test_name>/ is regenerated by running
# the agent for real, and the test is run again to verify the new data.
#
# Optional environment variables read by this script:
#   WORKSPACE_BASE, WORKSPACE_MOUNT_PATH
#       Base directories (default: the current directory); the suffix
#       "/_test_workspace" is appended to both.
#   ONLY_TEST_NAME   Run only the test with this name.
#   ONLY_TEST_AGENT  Run only this agent.
#   TEST_ONLY        When "true", exit-on-error stays enabled while the test
#                    runs, so a failing test aborts the script instead of
#                    triggering regeneration.
#
# -e: exit as soon as a command fails; -o pipefail: a pipeline fails if any
# of its stages fails.
set -eo pipefail
#######################################
# Run a single integration test case through pytest.
# Globals (read):
#   SANDBOX_TYPE, WORKSPACE_BASE, MAX_ITERATIONS, WORKSPACE_MOUNT_PATH
#     - passed to pytest as environment variables of the same name
#   agent      - passed to pytest as the AGENT environment variable
#   test_name  - name of the test function inside test_agent.py
# Arguments:
#   None
# Returns:
#   The exit status of pytest (the last command of the function).
#######################################
run_test() {
  # The test id is quoted so that a name containing whitespace or glob
  # characters is still handed to pytest as one argument.
  SANDBOX_TYPE="$SANDBOX_TYPE" \
    WORKSPACE_BASE="$WORKSPACE_BASE" \
    MAX_ITERATIONS="$MAX_ITERATIONS" \
    WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
    AGENT="$agent" \
    poetry run pytest -s "./tests/integration/test_agent.py::${test_name}"
}
# Default both workspace locations to the current directory when they are
# unset or empty, then point them at a dedicated "_test_workspace" subfolder.
WORKSPACE_MOUNT_PATH=${WORKSPACE_MOUNT_PATH:-$(pwd)}
WORKSPACE_BASE=${WORKSPACE_BASE:-$(pwd)}
WORKSPACE_MOUNT_PATH+="/_test_workspace"
WORKSPACE_BASE+="/_test_workspace"

SANDBOX_TYPE="ssh"
MAX_ITERATIONS=10

agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")

# tasks[i] is the prompt given to the agent when regenerating the mock data
# for test_names[i]; the two arrays must stay index-aligned.
tasks=(
  "Fix typos in bad.txt."
  "Write a shell script 'hello.sh' that prints 'hello'."
  "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
)
test_names=(
  "test_edits"
  "test_write_simple_script"
  "test_ipython"
)

num_of_tests=${#test_names[@]}
num_of_agents=${#agents[@]}

# Compare the number of tasks against the number of tests (comparing the test
# count with itself could never detect a mismatch).
if [[ "${#tasks[@]}" -ne "$num_of_tests" ]]; then
  echo "Every task must correspond to one test case" >&2
  exit 1
fi

# Start from a clean slate. ':?' aborts rather than expanding to an empty path.
rm -rf logs
rm -rf -- "${WORKSPACE_BASE:?}"
# Recreate an empty workspace at $WORKSPACE_BASE and seed it with the fixture
# files of the current $test_name, when a fixture directory exists.
reset_workspace() {
  rm -rf -- "${WORKSPACE_BASE:?}"
  mkdir -p -- "$WORKSPACE_BASE"
  if [[ -d "tests/integration/workspace/$test_name" ]]; then
    cp -r "tests/integration/workspace/$test_name"/* "$WORKSPACE_BASE"
  fi
}

# Run every selected test for every selected agent. A failing test triggers
# regeneration of that agent's mock data followed by a verification rerun.
for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}

  # skip other tests if only one test is specified
  if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
    continue
  fi

  for ((j = 0; j < num_of_agents; j++)); do
    agent=${agents[j]}

    # skip other agents if only one agent is specified
    if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
      continue
    fi

    echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
    reset_workspace

    if [[ "$TEST_ONLY" == true ]]; then
      # Keep 'exit on error' enabled: a failing test aborts the script here,
      # so the regeneration branch below is never reached in this mode.
      set -e
    else
      # Temporarily disable 'exit on error' so the failure can be handled
      set +e
    fi

    run_test
    TEST_STATUS=$?
    # Re-enable 'exit on error'
    set -e

    if [[ $TEST_STATUS -ne 0 ]]; then
      echo -e "\n\n\n\n========$test_name failed, regenerating test data for $agent========\n\n\n\n"
      sleep 1

      reset_workspace
      rm -rf logs
      # Drop the stale mock data; the glob stays outside the quotes on purpose.
      rm -rf "tests/integration/mock/$agent/$test_name"/*

      # set -x to print the command being executed
      set -x
      # "/exit" is fed to the program's stdin.
      # NOTE(review): presumably this ends an interactive prompt - confirm.
      echo -e "/exit\n" | \
        SANDBOX_TYPE="$SANDBOX_TYPE" \
        WORKSPACE_BASE="$WORKSPACE_BASE" \
        DEBUG=true \
        WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" AGENT="$agent" \
        poetry run python ./opendevin/core/main.py \
        -i "$MAX_ITERATIONS" \
        -t "$task Do not ask me for confirmation at any point." \
        -c "$agent"
      set +x

      # Move the freshly produced logs into the mock data directory.
      # Unless globstar is enabled, '**' matches like a single '*'.
      mkdir -p "tests/integration/mock/$agent/$test_name/"
      mv logs/llm/**/* "tests/integration/mock/$agent/$test_name/"

      echo -e "\n\n\n\n========$test_name test data regenerated for $agent, rerun test again to verify========\n\n\n\n"

      # Temporarily disable 'exit on error'
      set +e
      run_test
      TEST_STATUS=$?
      # Re-enable 'exit on error'
      set -e

      if [[ $TEST_STATUS -ne 0 ]]; then
        echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
        echo -e "There are multiple possibilities:"
        echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
        echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
        echo -e " 3. There is something non-deterministic in the prompt."
        echo -e " 4. There is a bug in this script, or in OpenDevin code."
        echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
        echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
        exit 1
      else
        echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
        sleep 1
      fi
    else
      echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
      sleep 1
    fi
  done
done

# Final cleanup of the temporary artefacts.
rm -rf logs
rm -rf -- "${WORKSPACE_BASE:?}"
echo "Done!"