Add option to run patch evaluation on Modal (#8607)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
2025-12-26 05:48:36 +08:00 · 2025-05-23 00:45:45 +07:00 · 2025-05-23 00:45:45 +07:00 · 3980ba53c9
commit 3980ba53c9
parent be78cc07bd
2 changed files with 20 additions and 2 deletions
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@ -158,6 +158,8 @@ The script now accepts optional arguments:
 - `instance_id`: Specify a single instance to evaluate (optional)
 - `dataset_name`: The name of the dataset to use (default: `"princeton-nlp/SWE-bench_Lite"`)
 - `split`: The split of the dataset to use (default: `"test"`)
+- `environment`: The environment to use for patch evaluation (default: `"local"`). You can set it to
+  `"modal"` to use the [official SWE-Bench support](https://github.com/swe-bench/SWE-bench/blob/main/docs/assets/evaluation.md#%EF%B8%8F-evaluation-with-modal) for running evaluation on Modal.

 For example, to evaluate a specific instance with a custom dataset and split:

--- a/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/eval_infer.sh
@ -16,11 +16,19 @@ fi
 INSTANCE_ID=$2
 DATASET_NAME=${3:-"princeton-nlp/SWE-bench_Lite"}
 SPLIT=${4:-"test"}
+ENVIRONMENT=${5:-"local"}

 echo "INSTANCE_ID: $INSTANCE_ID"
 echo "DATASET_NAME: $DATASET_NAME"
 echo "SPLIT: $SPLIT"

+if [[ "$ENVIRONMENT" != "local" && "$ENVIRONMENT" != "modal" ]]; then
+    echo "Error: ENVIRONMENT must be either 'local' or 'modal'"
+    exit 1
+fi
+
+echo "ENVIRONMENT: $ENVIRONMENT"
+
 PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH)
 FILE_DIR=$(dirname $PROCESS_FILEPATH)
 FILE_NAME=$(basename $PROCESS_FILEPATH)
@ -78,6 +86,12 @@ echo "=============================================================="
 RUN_ID=$(date +"%Y%m%d_%H%M%S")
 N_PROCESS=4

+
+MODAL_FLAG=""
+if [[ "$ENVIRONMENT" == "modal" ]]; then
+    MODAL_FLAG="--modal true"
+fi
+
 if [ -z "$INSTANCE_ID" ]; then
    echo "Running SWE-bench evaluation on the whole input file..."
    # Default to SWE-Bench-lite
@ -90,7 +104,8 @@ if [ -z "$INSTANCE_ID" ]; then
        --timeout 3600 \
        --cache_level instance \
        --max_workers $N_PROCESS \
-        --run_id $RUN_ID
+        --run_id $RUN_ID \
+        $MODAL_FLAG

    # get the "model_name_or_path" from the first line of the SWEBENCH_FORMAT_JSONL
    MODEL_NAME_OR_PATH=$(jq -r '.model_name_or_path' $SWEBENCH_FORMAT_JSONL | head -n 1)
@ -137,5 +152,6 @@ else
        --instance_ids $INSTANCE_ID \
        --cache_level instance \
        --max_workers $N_PROCESS \
-        --run_id $RUN_ID
+        --run_id $RUN_ID \
+        $MODAL_FLAG
 fi