[Eval] Improve SWE-Bench Eval harness: multi-run support & entry script simplification (#4396)

2026-03-22 13:47:19 +08:00 · 2024-10-15 08:34:52 -05:00
parent 15df12cf15
commit 50c13aad98
16 changed files with 31 additions and 66 deletions
--- a/evaluation/EDA/scripts/run_infer.sh
+++ b/evaluation/EDA/scripts/run_infer.sh
@@ -50,7 +50,6 @@ COMMAND="poetry run python evaluation/EDA/run_infer.py \
  --data-split test \
  --max-iterations 20 \
  --OPENAI_API_KEY $OPENAI_API_KEY \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

--- a/evaluation/agent_bench/scripts/run_infer.sh
+++ b/evaluation/agent_bench/scripts/run_infer.sh
@@ -30,7 +30,6 @@ COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run pyt
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

--- a/evaluation/aider_bench/scripts/run_infer.sh
+++ b/evaluation/aider_bench/scripts/run_infer.sh
@@ -43,7 +43,6 @@ COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run pyt
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 30 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/biocoder/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${DATASET}"

--- a/evaluation/bird/scripts/run_infer.sh
+++ b/evaluation/bird/scripts/run_infer.sh
@@ -30,7 +30,6 @@ COMMAND="poetry run python evaluation/bird/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 5 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION" \

--- a/evaluation/browsing_delegation/scripts/run_infer.sh
+++ b/evaluation/browsing_delegation/scripts/run_infer.sh
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

--- a/evaluation/gaia/scripts/run_infer.sh
+++ b/evaluation/gaia/scripts/run_infer.sh
@@ -41,7 +41,6 @@ COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
  --max-iterations 30 \
  --level $LEVELS \
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

--- a/evaluation/gorilla/scripts/run_infer.sh
+++ b/evaluation/gorilla/scripts/run_infer.sh
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/gorilla/run_infer.py \
  --max-iterations 30 \
  --hubs $HUBS \
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

--- a/evaluation/gpqa/scripts/run_infer.sh
+++ b/evaluation/gpqa/scripts/run_infer.sh
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/gpqa/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --data-split $DATA_SPLIT \
  --eval-note $AGENT_VERSION"
--- a/evaluation/humanevalfix/scripts/run_infer.sh
+++ b/evaluation/humanevalfix/scripts/run_infer.sh
@@ -68,7 +68,6 @@ COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

--- a/evaluation/logic_reasoning/scripts/run_infer.sh
+++ b/evaluation/logic_reasoning/scripts/run_infer.sh
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
  --llm-config $MODEL_CONFIG \
  --dataset $DATASET \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $AGENT_VERSION"

--- a/evaluation/miniwob/scripts/run_infer.sh
+++ b/evaluation/miniwob/scripts/run_infer.sh
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 10 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS"

 if [ -n "$EVAL_LIMIT" ]; then
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -11,6 +11,7 @@ MAX_ITER=$5
 NUM_WORKERS=$6
 DATASET=$7
 SPLIT=$8
+N_RUNS=$9

 if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
@@ -69,26 +70,37 @@ fi
 if [ -n "$EXP_NAME" ]; then
  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
 fi
-echo "EVAL_NOTE: $EVAL_NOTE"
+
+function run_eval() {  # Run one inference pass; $1 is the eval note that tags this run.
+  local eval_note=$1  # per-run note, e.g. "<EVAL_NOTE>-run_<i>" from the caller's loop
+  COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
+    --agent-cls $AGENT \
+    --llm-config $MODEL_CONFIG \
+    --max-iterations $MAX_ITER \
+    --eval-num-workers $NUM_WORKERS \
+    --eval-note $eval_note \
+    --dataset $DATASET \
+    --split $SPLIT"
+
+  if [ -n "$EVAL_LIMIT" ]; then
+    echo "EVAL_LIMIT: $EVAL_LIMIT"
+    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+  fi
+
+  # Quote so the string reaches eval intact: one parse, no pre-eval splitting/globbing.
+  eval "$COMMAND"
+}

 unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
-
-COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
-  --agent-cls $AGENT \
-  --llm-config $MODEL_CONFIG \
-  --max-iterations $MAX_ITER \
-  --max-chars 10000000 \
-  --eval-num-workers $NUM_WORKERS \
-  --eval-note $EVAL_NOTE \
-  --dataset $DATASET \
-  --split $SPLIT"
-
-if [ -n "$EVAL_LIMIT" ]; then
-  echo "EVAL_LIMIT: $EVAL_LIMIT"
-  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+if [ -z "$N_RUNS" ]; then
+  N_RUNS=1
+  echo "N_RUNS not specified, use default $N_RUNS"
 fi

-# Run the command
-eval $COMMAND
+for i in $(seq 1 "$N_RUNS"); do  # one full inference pass per requested run
+  current_eval_note="$EVAL_NOTE-run_$i"  # suffix the note with the run index
+  echo "EVAL_NOTE: $current_eval_note"
+  run_eval "$current_eval_note"  # quoted: the note arrives as a single, unexpanded $1
+done

 checkout_original_branch
--- a/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh
@@ -1,13 +1,8 @@
 #!/bin/bash

 source ~/.bashrc
-
 SWEUTIL_DIR=/swe_util

-# Create logs directory
-LOG_DIR=/openhands/logs
-mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
-
 # FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
 # SWE_INSTANCE_ID=django__django-11099
 if [ -z "$SWE_INSTANCE_ID" ]; then
@@ -27,15 +22,6 @@ WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | t

 echo "WORKSPACE_NAME: $WORKSPACE_NAME"

-SWE_TASK_DIR=/openhands/swe_tasks
-mkdir -p $SWE_TASK_DIR
-# Dump test_patch to /workspace/test.patch
-echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
-# Dump patch to /workspace/gold.patch
-echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
-# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
-echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
-
 # Clear the workspace
 if [ -d /workspace ]; then
    rm -rf /workspace/*
@@ -46,28 +32,9 @@ fi
 if [ -d /workspace/$WORKSPACE_NAME ]; then
    rm -rf /workspace/$WORKSPACE_NAME
 fi
-cp -r /testbed/ /workspace/$WORKSPACE_NAME/
-
-# Reset swe-bench testbed and install the repo
-. /opt/miniconda3/etc/profile.d/conda.sh
-conda activate testbed
-
-mkdir -p $SWE_TASK_DIR/reset_testbed_temp
-mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
-
-REPO_PATH=/workspace/$WORKSPACE_NAME
-echo "Repo Path: $REPO_PATH"
-# echo "Test Command: $TEST_CMD"
-echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
-# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
-
-if [[ "$REPO_PATH" == "None" ]]; then
-    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
-    exit 1
-fi
+mkdir -p /workspace
+ln -s /testbed /workspace/$WORKSPACE_NAME

 # Activate instance-specific environment
 . /opt/miniconda3/etc/profile.d/conda.sh
 conda activate testbed
-
-# set +e
--- a/evaluation/toolqa/scripts/run_infer.sh
+++ b/evaluation/toolqa/scripts/run_infer.sh
@@ -55,7 +55,6 @@ COMMAND="poetry run python evaluation/toolqa/run_infer.py \
  --hardness $HARDNESS \
  --wolfram_alpha_appid $WOLFRAM_APPID\
  --data-split validation \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note ${AGENT_VERSION}_${LEVELS}"

--- a/evaluation/webarena/scripts/run_infer.sh
+++ b/evaluation/webarena/scripts/run_infer.sh
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/webarena/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 15 \
-  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"