[Eval] Improve SWE-Bench Eval harness: multi-run support & entry script simplification (#4396)

This commit is contained in:
Xingyao Wang
2024-10-15 08:34:52 -05:00
committed by GitHub
parent 15df12cf15
commit 50c13aad98
16 changed files with 31 additions and 66 deletions

View File

@@ -50,7 +50,6 @@ COMMAND="poetry run python evaluation/EDA/run_infer.py \
--data-split test \
--max-iterations 20 \
--OPENAI_API_KEY $OPENAI_API_KEY \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"

View File

@@ -30,7 +30,6 @@ COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run pyt
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"

View File

@@ -43,7 +43,6 @@ COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run pyt
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE"

View File

@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/biocoder/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"

View File

@@ -30,7 +30,6 @@ COMMAND="poetry run python evaluation/bird/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \

View File

@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 1 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE"

View File

@@ -41,7 +41,6 @@ COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
--max-iterations 30 \
--level $LEVELS \
--data-split validation \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"

View File

@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/gorilla/run_infer.py \
--max-iterations 30 \
--hubs $HUBS \
--data-split validation \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"

View File

@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/gpqa/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--data-split $DATA_SPLIT \
--eval-note $AGENT_VERSION"

View File

@@ -68,7 +68,6 @@ COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"

View File

@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
--llm-config $MODEL_CONFIG \
--dataset $DATASET \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"

View File

@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS"
if [ -n "$EVAL_LIMIT" ]; then

View File

@@ -11,6 +11,7 @@ MAX_ITER=$5
NUM_WORKERS=$6
DATASET=$7
SPLIT=$8
N_RUNS=$9
if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
@@ -69,26 +70,37 @@ fi
if [ -n "$EXP_NAME" ]; then
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
fi
echo "EVAL_NOTE: $EVAL_NOTE"
function run_eval() {
local eval_note=$1
COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \
--eval-num-workers $NUM_WORKERS \
--eval-note $eval_note \
--dataset $DATASET \
--split $SPLIT"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND
}
unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE \
--dataset $DATASET \
--split $SPLIT"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
if [ -z "$N_RUNS" ]; then
N_RUNS=1
echo "N_RUNS not specified, use default $N_RUNS"
fi
# Run the command
eval $COMMAND
for i in $(seq 1 $N_RUNS); do
current_eval_note="$EVAL_NOTE-run_$i"
echo "EVAL_NOTE: $current_eval_note"
run_eval $current_eval_note
done
checkout_original_branch

View File

@@ -1,13 +1,8 @@
#!/bin/bash
source ~/.bashrc
SWEUTIL_DIR=/swe_util
# Create logs directory
LOG_DIR=/openhands/logs
mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
# SWE_INSTANCE_ID=django__django-11099
if [ -z "$SWE_INSTANCE_ID" ]; then
@@ -27,15 +22,6 @@ WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | t
echo "WORKSPACE_NAME: $WORKSPACE_NAME"
SWE_TASK_DIR=/openhands/swe_tasks
mkdir -p $SWE_TASK_DIR
# Dump test_patch to /workspace/test.patch
echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
# Dump patch to /workspace/gold.patch
echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
# Clear the workspace
if [ -d /workspace ]; then
rm -rf /workspace/*
@@ -46,28 +32,9 @@ fi
if [ -d /workspace/$WORKSPACE_NAME ]; then
rm -rf /workspace/$WORKSPACE_NAME
fi
cp -r /testbed/ /workspace/$WORKSPACE_NAME/
# Reset swe-bench testbed and install the repo
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate testbed
mkdir -p $SWE_TASK_DIR/reset_testbed_temp
mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
REPO_PATH=/workspace/$WORKSPACE_NAME
echo "Repo Path: $REPO_PATH"
# echo "Test Command: $TEST_CMD"
echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
if [[ "$REPO_PATH" == "None" ]]; then
echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
exit 1
fi
mkdir -p /workspace
ln -s /testbed /workspace/$WORKSPACE_NAME
# Activate instance-specific environment
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate testbed
# set +e

View File

@@ -55,7 +55,6 @@ COMMAND="poetry run python evaluation/toolqa/run_infer.py \
--hardness $HARDNESS \
--wolfram_alpha_appid $WOLFRAM_APPID\
--data-split validation \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"

View File

@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/webarena/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 15 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $EVAL_NOTE"