# Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
#!/bin/bash

# NOTE: this script is for rolling out the SWE-Gym dataset for **TRAINING**
# For more information, please refer to
# 1. the Github Repo: https://github.com/SWE-Gym/SWE-Gym
# 2. the paper: https://arxiv.org/abs/2412.21139

MODEL=$1           # llm config name from config.toml (eg: "llm.claude-3-5-sonnet-20241022-t05")
EXP_NAME=$2        # experiment tag, eg: "train-t05"
N_WORKERS=${3:-64} # parallel rollout workers
N_RUNS=${4:-1}     # number of independent rollout passes

export EXP_NAME

# use 2x resources for rollout since some codebases are pretty resource-intensive
export DEFAULT_RUNTIME_RESOURCE_FACTOR=2
export ITERATIVE_EVAL_MODE=false

echo "MODEL: $MODEL"
echo "EXP_NAME: $EXP_NAME"

# change this to "SWE-Gym/SWE-Gym-Lite" if you want to rollout the lite subset
DATASET="SWE-Gym/SWE-Gym"
SPLIT="train"

# Pick the runtime backend: remote when an API key is available, local Docker otherwise.
if [ -z "$ALLHANDS_API_KEY" ]; then
  echo "ALLHANDS_API_KEY is not set. Will rollout and evaluate locally using Docker. WARNING: A large value of N_WORKERS will result in a large number of Docker containers being spun up and may crash your machine."
  export RUNTIME=docker
else
  echo "ALLHANDS_API_KEY is set. Continuing rollout and evaluation with remote runtime..."
  export RUNTIME=remote
  export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
  export EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images"
fi

EVAL_LIMIT=3000 # cap on instances rolled out per pass
MAX_ITER=100    # max agent iterations per instance
# ===== Run inference =====
# version_control.sh presumably defines get_openhands_version, which sets
# $OPENHANDS_VERSION — TODO confirm against that script.
source "evaluation/utils/version_control.sh"
get_openhands_version

echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
# BUG FIX: MODEL_CONFIG is never assigned anywhere in this script; the llm
# config name the user passed in lives in $MODEL, so print that instead of
# an always-empty variable.
echo "MODEL_CONFIG: $MODEL"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"

# Default to NOT use Hint
export USE_INSTANCE_IMAGE=true
export USE_HINT_TEXT=false
export RUN_WITH_BROWSING=false
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
# Tag all outputs of this experiment with version + hint mode + experiment name.
EVAL_NOTE="$OPENHANDS_VERSION-no-hint-$EXP_NAME"
#######################################
# Run one inference pass over the dataset.
# Globals (read): MODEL, MAX_ITER, N_WORKERS, DATASET, SPLIT, EVAL_LIMIT
# Arguments:      $1 - eval note used to tag this run's output directory
# Outputs:        run_infer.py's stdout/stderr
# Returns:        exit status of run_infer.py
#######################################
function run_eval() {
  local eval_note=$1

  # BUG FIX: the original built a single string and ran it through `eval`,
  # word-splitting every unquoted variable (an eval note or dataset name
  # containing a space would be mangled). Build the argv as an array instead.
  local -a cmd=(
    poetry run python evaluation/benchmarks/swe_bench/run_infer.py
    --agent-cls CodeActAgent
    --llm-config "$MODEL"
    --max-iterations "$MAX_ITER"
    --eval-num-workers "$N_WORKERS"
    --eval-note "$eval_note"
    --dataset "$DATASET"
    --split "$SPLIT"
  )

  if [ -n "$EVAL_LIMIT" ]; then
    echo "EVAL_LIMIT: $EVAL_LIMIT"
    cmd+=(--eval-n-limit "$EVAL_LIMIT")
  fi

  # Run the command
  "${cmd[@]}"
}
# Roll out N_RUNS independent passes. Each pass retries inference until it
# succeeds, extracts the output file path, retries evaluation until it
# succeeds, then merges the eval results back into the output.
for run_idx in $(seq 1 "$N_RUNS"); do

  # --- Inference: retry until run_eval exits 0 ---
  while true; do
    echo "### Running inference... ###"
    unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
    current_eval_note="$EVAL_NOTE-run_$run_idx"
    echo "EVAL_NOTE: $current_eval_note"
    # Quoted so a note containing whitespace stays a single argument.
    INFER_OUTPUT=$(run_eval "$current_eval_note")
    INFER_STATUS=$? # exit status of run_eval (the original comment wrongly said "run_infer.sh")
    echo "INFER_STATUS: $INFER_STATUS"

    echo "### Cleaning up remote runtime... ###"
    ./evaluation/utils/scripts/cleanup_remote_runtime.sh

    if [ $INFER_STATUS -eq 0 ]; then
      echo "### Inference completed successfully. ###"
      break
    else
      echo "### Inference failed with exit code $INFER_STATUS. Retrying... ###"
    fi
  done

  # Extract the output file path that run_infer.py prints between the
  # special "### OUTPUT FILE: ... ###" delimiters.
  OUTPUT_FILE=$(echo "$INFER_OUTPUT" | grep -o '### OUTPUT FILE:.* ###' | sed 's/### OUTPUT FILE: \(.*\) ###/\1/')
  echo "Got OUTPUT_FILE: $OUTPUT_FILE"

  # --- Evaluation: retry until eval_infer.py exits 0 ---
  while true; do
    echo "### Evaluating on $OUTPUT_FILE ... ###"
    # BUG FIX: build the argv as an array instead of a string fed to `eval`,
    # so paths with spaces survive intact. Eval gets 2x workers since grading
    # is cheaper per-instance than rollout.
    cmd=(
      poetry run python evaluation/benchmarks/swe_bench/eval_infer.py
      --eval-num-workers "$((N_WORKERS * 2))"
      --input-file "$OUTPUT_FILE"
      --dataset "$DATASET"
      --split "$SPLIT"
    )

    if [ -n "$EVAL_LIMIT" ]; then
      echo "EVAL_LIMIT: $EVAL_LIMIT"
      cmd+=(--eval-n-limit "$EVAL_LIMIT")
    fi

    echo "Running command: ${cmd[*]}"
    # Run the command
    "${cmd[@]}"
    EVAL_STATUS=$?
    if [ $EVAL_STATUS -eq 0 ]; then
      echo "### Evaluation completed successfully. ###"
      break
    else
      echo "### Evaluation failed with exit code $EVAL_STATUS. Retrying... ###"
    fi

    ./evaluation/utils/scripts/cleanup_remote_runtime.sh
  done

  # update the output with evaluation results
  echo "### Updating the output with evaluation results... ###"
  poetry run python evaluation/benchmarks/swe_bench/scripts/eval/update_output_with_eval.py "$OUTPUT_FILE"

  echo "### Combining the final completions... ###"
  poetry run python evaluation/benchmarks/swe_bench/scripts/eval/combine_final_completions.py "$OUTPUT_FILE"

  echo "### DONE for run $run_idx! ###"
  # NOTE(review): FINAL_OUTPUT_FILE is never assigned in this script, and a
  # child python process cannot export it back — this likely prints only the
  # directory. Verify whether it is meant to come from the environment.
  echo "You can find the final output at $(dirname "$OUTPUT_FILE")/$FINAL_OUTPUT_FILE"
done