mirror of
https://github.com/OpenHands/OpenHands.git
synced 2026-03-22 13:47:19 +08:00
[Eval] Improve SWE-Bench Eval harness: multi-run support & entry script simplification (#4396)
This commit is contained in:
@@ -50,7 +50,6 @@ COMMAND="poetry run python evaluation/EDA/run_infer.py \
|
||||
--data-split test \
|
||||
--max-iterations 20 \
|
||||
--OPENAI_API_KEY $OPENAI_API_KEY \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${DATASET}"
|
||||
|
||||
|
||||
@@ -30,7 +30,6 @@ COMMAND="export PYTHONPATH=evaluation/agent_bench:\$PYTHONPATH && poetry run pyt
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
|
||||
|
||||
@@ -43,7 +43,6 @@ COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run pyt
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 30 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE"
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/biocoder/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${DATASET}"
|
||||
|
||||
|
||||
@@ -30,7 +30,6 @@ COMMAND="poetry run python evaluation/bird/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 5 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION" \
|
||||
|
||||
|
||||
@@ -32,7 +32,6 @@ COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 1 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE"
|
||||
|
||||
|
||||
@@ -41,7 +41,6 @@ COMMAND="poetry run python ./evaluation/gaia/run_infer.py \
|
||||
--max-iterations 30 \
|
||||
--level $LEVELS \
|
||||
--data-split validation \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/gorilla/run_infer.py \
|
||||
--max-iterations 30 \
|
||||
--hubs $HUBS \
|
||||
--data-split validation \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/gpqa/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--data-split $DATA_SPLIT \
|
||||
--eval-note $AGENT_VERSION"
|
||||
|
||||
@@ -68,7 +68,6 @@ COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/logic_reasoning/run_infer.py \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--dataset $DATASET \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $AGENT_VERSION"
|
||||
|
||||
|
||||
@@ -37,7 +37,6 @@ COMMAND="poetry run python evaluation/miniwob/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
|
||||
@@ -11,6 +11,7 @@ MAX_ITER=$5
|
||||
NUM_WORKERS=$6
|
||||
DATASET=$7
|
||||
SPLIT=$8
|
||||
N_RUNS=$9
|
||||
|
||||
if [ -z "$NUM_WORKERS" ]; then
|
||||
NUM_WORKERS=1
|
||||
@@ -69,26 +70,37 @@ fi
|
||||
if [ -n "$EXP_NAME" ]; then
|
||||
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
|
||||
fi
|
||||
echo "EVAL_NOTE: $EVAL_NOTE"
|
||||
|
||||
function run_eval() {
|
||||
local eval_note=$1
|
||||
COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $eval_note \
|
||||
--dataset $DATASET \
|
||||
--split $SPLIT"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
|
||||
fi
|
||||
|
||||
# Run the command
|
||||
eval $COMMAND
|
||||
}
|
||||
|
||||
unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
|
||||
|
||||
COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations $MAX_ITER \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE \
|
||||
--dataset $DATASET \
|
||||
--split $SPLIT"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
|
||||
if [ -z "$N_RUNS" ]; then
|
||||
N_RUNS=1
|
||||
echo "N_RUNS not specified, use default $N_RUNS"
|
||||
fi
|
||||
|
||||
# Run the command
|
||||
eval $COMMAND
|
||||
for i in $(seq 1 $N_RUNS); do
|
||||
current_eval_note="$EVAL_NOTE-run_$i"
|
||||
echo "EVAL_NOTE: $current_eval_note"
|
||||
run_eval $current_eval_note
|
||||
done
|
||||
|
||||
checkout_original_branch
|
||||
|
||||
@@ -1,13 +1,8 @@
|
||||
#!/bin/bash
|
||||
|
||||
source ~/.bashrc
|
||||
|
||||
SWEUTIL_DIR=/swe_util
|
||||
|
||||
# Create logs directory
|
||||
LOG_DIR=/openhands/logs
|
||||
mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
|
||||
|
||||
# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
|
||||
# SWE_INSTANCE_ID=django__django-11099
|
||||
if [ -z "$SWE_INSTANCE_ID" ]; then
|
||||
@@ -27,15 +22,6 @@ WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | t
|
||||
|
||||
echo "WORKSPACE_NAME: $WORKSPACE_NAME"
|
||||
|
||||
SWE_TASK_DIR=/openhands/swe_tasks
|
||||
mkdir -p $SWE_TASK_DIR
|
||||
# Dump test_patch to /workspace/test.patch
|
||||
echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
|
||||
# Dump patch to /workspace/gold.patch
|
||||
echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
|
||||
# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
|
||||
echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
|
||||
|
||||
# Clear the workspace
|
||||
if [ -d /workspace ]; then
|
||||
rm -rf /workspace/*
|
||||
@@ -46,28 +32,9 @@ fi
|
||||
if [ -d /workspace/$WORKSPACE_NAME ]; then
|
||||
rm -rf /workspace/$WORKSPACE_NAME
|
||||
fi
|
||||
cp -r /testbed/ /workspace/$WORKSPACE_NAME/
|
||||
|
||||
# Reset swe-bench testbed and install the repo
|
||||
. /opt/miniconda3/etc/profile.d/conda.sh
|
||||
conda activate testbed
|
||||
|
||||
mkdir -p $SWE_TASK_DIR/reset_testbed_temp
|
||||
mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
|
||||
|
||||
REPO_PATH=/workspace/$WORKSPACE_NAME
|
||||
echo "Repo Path: $REPO_PATH"
|
||||
# echo "Test Command: $TEST_CMD"
|
||||
echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
|
||||
# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
|
||||
|
||||
if [[ "$REPO_PATH" == "None" ]]; then
|
||||
echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p /workspace
|
||||
ln -s /testbed /workspace/$WORKSPACE_NAME
|
||||
|
||||
# Activate instance-specific environment
|
||||
. /opt/miniconda3/etc/profile.d/conda.sh
|
||||
conda activate testbed
|
||||
|
||||
# set +e
|
||||
|
||||
@@ -55,7 +55,6 @@ COMMAND="poetry run python evaluation/toolqa/run_infer.py \
|
||||
--hardness $HARDNESS \
|
||||
--wolfram_alpha_appid $WOLFRAM_APPID\
|
||||
--data-split validation \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note ${AGENT_VERSION}_${LEVELS}"
|
||||
|
||||
|
||||
@@ -39,7 +39,6 @@ COMMAND="poetry run python evaluation/webarena/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 15 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user