diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md
index a1bc26fb7f..f51c56462e 100644
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -2,6 +2,8 @@

 This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).

+**UPDATE (8/12/2025): We now support running SWE-rebench evaluation (see the paper [here](https://arxiv.org/abs/2505.20411))! For how to run it, checkout [this README](./SWE-rebench.md).**
+
 **UPDATE (6/15/2025): We now support running SWE-bench-Live evaluation (see the paper [here](https://arxiv.org/abs/2505.23419))! For how to run it, checkout [this README](./SWE-bench-Live.md).**

 **UPDATE (5/26/2025): We now support running interactive SWE-Bench evaluation (see the paper [here](https://arxiv.org/abs/2502.13069))! For how to run it, checkout [this README](./SWE-Interact.md).**
diff --git a/evaluation/benchmarks/swe_bench/SWE-rebench.md b/evaluation/benchmarks/swe_bench/SWE-rebench.md
new file mode 100644
index 0000000000..9e510e681b
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/SWE-rebench.md
@@ -0,0 +1,84 @@
+# SWE-rebench
+
+[📃 Paper](https://arxiv.org/abs/2505.20411) • [🤗 HuggingFace](https://huggingface.co/datasets/nebius/SWE-rebench) • [📊 Leaderboard](https://swe-rebench.com/leaderboard)
+
+SWE-rebench is a large-scale dataset of verifiable software engineering tasks.
+It is published as **two datasets**:
+
+* **[`nebius/SWE-rebench-leaderboard`](https://huggingface.co/datasets/nebius/SWE-rebench-leaderboard)** – continuously updated benchmark used for [leaderboard evaluation](https://swe-rebench.com/leaderboard).
+* **[`nebius/SWE-rebench`](https://huggingface.co/datasets/nebius/SWE-rebench)** – full dataset with **21,302 tasks**, suitable for training or large-scale offline evaluation.
+
+This document explains how to run OpenHands on SWE-rebench, using the leaderboard split as the main example.
+To run on the full dataset, simply replace the dataset name.
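+
+For a quick look at what the harness will consume, you can load the leaderboard dataset with the `datasets` library. This is a minimal sketch; it assumes the standard SWE-bench-style `instance_id` column, so adjust the field name if the schema differs:
+
+```python
+from datasets import load_dataset
+
+# `test` is the split used for leaderboard submission (see "Running Inference" below).
+ds = load_dataset('nebius/SWE-rebench-leaderboard', split='test')
+print(len(ds), 'task instances')
+print(ds[0]['instance_id'])  # assumed SWE-bench-style identifier, e.g. <repo>__<name>
+```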
+
+
+## Setting Up
+
+Set up your development environment and configure your LLM provider by following the [SWE-bench README](README.md) in this directory.
+
+
+## Running Inference
+
+Use the existing SWE-bench inference script, changing the dataset to `nebius/SWE-rebench-leaderboard` and selecting the split (`test` for leaderboard submission):
+
+```bash
+./evaluation/benchmarks/swe_bench/scripts/run_infer.sh \
+    llm.your_llm HEAD CodeActAgent 30 50 1 nebius/SWE-rebench-leaderboard test
+```
+
+Arguments:
+
+* `llm.your_llm` – your model configuration key
+* `HEAD` – commit reference for reproducibility
+* `CodeActAgent` – agent type
+* `30` – number of instances to evaluate
+* `50` – maximum iterations per task (increase if needed)
+* `1` – number of workers
+* `nebius/SWE-rebench-leaderboard` – Hugging Face dataset name
+* `test` – dataset split
+
+**Tip:** To run on the **full 21k dataset**, replace `nebius/SWE-rebench-leaderboard` with `nebius/SWE-rebench`.
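+
+Once inference finishes, `run_infer.sh` writes its results to an `output.jsonl` file. Before converting it (next section), a quick count can confirm that every instance produced a result. This is a minimal sketch that only assumes each JSONL line carries an `instance_id` field; the path below is the same placeholder used in the conversion step.
+
+```python
+import json
+
+# Placeholder path – use the output.jsonl location reported by run_infer.sh.
+output_file = 'path/to/evaluation/output.jsonl'
+with open(output_file) as f:
+    ids = [json.loads(line)['instance_id'] for line in f if line.strip()]
+print(f'{len(ids)} results, {len(set(ids))} unique instance ids')
+```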
+
+
+## Evaluating Results
+
+After inference completes, evaluate using the [SWE-bench-fork evaluation harness](https://github.com/SWE-rebench/SWE-bench-fork).
+
+1. Convert the OpenHands output to SWE-bench evaluation format (a quick sanity check of the converted file is sketched after these steps):
+
+```bash
+python evaluation/benchmarks/swe_bench/scripts/live/convert.py \
+    --output_jsonl path/to/evaluation/output.jsonl > preds.jsonl
+```
+
+2. Clone the [SWE-bench-fork repository](https://github.com/SWE-rebench/SWE-bench-fork) and follow its README to install dependencies.
+
+3. Run the evaluation using the fork:
+
+```bash
+python -m swebench.harness.run_evaluation \
+    --dataset_name nebius/SWE-rebench-leaderboard \
+    --split test \
+    --predictions_path preds.jsonl \
+    --max_workers 10 \
+    --run_id openhands
+```
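+
+Between steps 1 and 3, it can help to sanity-check `preds.jsonl` before launching the harness. A minimal sketch, assuming the converted file follows the standard SWE-bench prediction format (`instance_id`, `model_name_or_path`, `model_patch`):
+
+```python
+import json
+
+with open('preds.jsonl') as f:
+    preds = [json.loads(line) for line in f if line.strip()]
+
+# Instances with an empty patch cannot be resolved by the harness.
+missing = [p['instance_id'] for p in preds if not p.get('model_patch')]
+print(f'{len(preds)} predictions, {len(missing)} with an empty patch')
+```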
+
+
+## Citation
+
+```bibtex
+@article{badertdinov2025swerebench,
+  title={SWE-rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents},
+  author={Badertdinov, Ibragim and Golubev, Alexander and Nekrashevich, Maksim and Shevtsov, Anton and Karasik, Simon and Andriushchenko, Andrei and Trofimova, Maria and Litvintseva, Daria and Yangel, Boris},
+  journal={arXiv preprint arXiv:2505.20411},
+  year={2025}
+}
+```
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 5272c58a19..715e84a354 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -80,6 +80,8 @@ def set_dataset_type(dataset_name: str) -> str:
         DATASET_TYPE = 'SWE-Gym'
     elif 'swe-bench-live' in name_lower:
         DATASET_TYPE = 'SWE-bench-Live'
+    elif 'swe-rebench' in name_lower:
+        DATASET_TYPE = 'SWE-rebench'
     elif 'multimodal' in name_lower:
         DATASET_TYPE = 'Multimodal'
     else:
@@ -178,6 +180,8 @@ def get_instance_docker_image(
         docker_image_prefix = 'docker.io/starryzhang/'
     elif DATASET_TYPE == 'SWE-bench':
         docker_image_prefix = 'docker.io/swebench/'
+    elif DATASET_TYPE == 'SWE-rebench':
+        docker_image_prefix = 'docker.io/swerebench/'
     repo, name = instance_id.split('__')
     image_name = f'{docker_image_prefix.rstrip("/")}/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
     logger.debug(f'Using official SWE-Bench image: {image_name}')
@@ -318,6 +322,8 @@ def initialize_runtime(
     # inject the instance swe entry
     if DATASET_TYPE == 'SWE-bench-Live':
         entry_script_path = 'instance_swe_entry_live.sh'
+    elif DATASET_TYPE == 'SWE-rebench':
+        entry_script_path = 'instance_swe_entry_rebench.sh'
     else:
         entry_script_path = 'instance_swe_entry.sh'
     runtime.copy_to(
diff --git a/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry_rebench.sh b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry_rebench.sh
new file mode 100644
index 0000000000..7ea5d5659b
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry_rebench.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+source ~/.bashrc
+SWEUTIL_DIR=/swe_util
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Read swe-bench-instance.json and extract the record matching the instance ID
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
+
+if [[ -z "$item" ]]; then
+    echo "No item found for the provided instance ID."
+    exit 1
+fi
+
+WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
+
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
+
+# Clear the workspace
+if [ -d /workspace ]; then
+    rm -rf /workspace/*
+else
+    mkdir /workspace
+fi
+
+# Copy repo to workspace
+if [ -d /workspace/$WORKSPACE_NAME ]; then
+    rm -rf /workspace/$WORKSPACE_NAME
+fi
+mkdir -p /workspace
+cp -r /testbed /workspace/$WORKSPACE_NAME
+
+# Activate instance-specific environment
+if [ -d /opt/miniconda3 ]; then
+    . /opt/miniconda3/etc/profile.d/conda.sh
+    conda activate testbed
+fi
+
+# Make sure the testbed environment's binaries are on PATH (covers images where conda lives under /opt/conda)
+export PATH=/opt/conda/envs/testbed/bin:$PATH
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index aeb563240d..12bc3540eb 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -263,19 +263,20 @@ def prepare_dataset(
             f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
         )

-    def make_serializable(instance: pd.Series) -> dict:
+    def make_serializable(instance_dict: dict) -> dict:
         import numpy as np

-        instance_dict = instance.to_dict()
         for k, v in instance_dict.items():
             if isinstance(v, np.ndarray):
                 instance_dict[k] = v.tolist()
             elif isinstance(v, pd.Timestamp):
                 instance_dict[k] = str(v)
+            elif isinstance(v, dict):
+                instance_dict[k] = make_serializable(v)
         return instance_dict

     new_dataset = [
-        make_serializable(instance)
+        make_serializable(instance.to_dict())
         for _, instance in dataset.iterrows()
         if str(instance[id_column]) not in finished_ids
     ]
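
For context on the `evaluation/utils/shared.py` change: the new `elif isinstance(v, dict)` branch recursively converts nested dictionaries, which matters when a dataset column itself holds a dict whose values are numpy arrays or pandas timestamps (as SWE-rebench rows appear to). A minimal standalone sketch of the updated helper, exercised on a hypothetical nested row:

```python
import json

import numpy as np
import pandas as pd


def make_serializable(instance_dict: dict) -> dict:
    # Mirrors the updated helper in evaluation/utils/shared.py.
    for k, v in instance_dict.items():
        if isinstance(v, np.ndarray):
            instance_dict[k] = v.tolist()
        elif isinstance(v, pd.Timestamp):
            instance_dict[k] = str(v)
        elif isinstance(v, dict):
            # New branch: recurse into nested dicts so their numpy/timestamp
            # values are converted as well.
            instance_dict[k] = make_serializable(v)
    return instance_dict


# Hypothetical row with a nested dict column.
row = {
    'instance_id': 'repo__pkg-123',
    'created_at': pd.Timestamp('2025-01-01'),
    'meta': {'scores': np.array([0.5, 0.7])},
}
print(json.dumps(make_serializable(row)))  # serializes cleanly with the recursion in place
```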