feat(eval): Support evaluation on SWE-rebench (#10251)
parent 2b7e44819f, commit 19a6b6b618
evaluation/benchmarks/swe_bench/README.md

```diff
@@ -2,6 +2,8 @@
 This folder contains the evaluation harness that we built on top of the original [SWE-Bench benchmark](https://www.swebench.com/) ([paper](https://arxiv.org/abs/2310.06770)).
 
+**UPDATE (8/12/2025): We now support running SWE-rebench evaluation (see the paper [here](https://arxiv.org/abs/2505.20411))! For how to run it, check out [this README](./SWE-rebench.md).**
+
 **UPDATE (6/15/2025): We now support running SWE-bench-Live evaluation (see the paper [here](https://arxiv.org/abs/2505.23419))! For how to run it, check out [this README](./SWE-bench-Live.md).**
 
 **UPDATE (5/26/2025): We now support running interactive SWE-Bench evaluation (see the paper [here](https://arxiv.org/abs/2502.13069))! For how to run it, check out [this README](./SWE-Interact.md).**
```
evaluation/benchmarks/swe_bench/SWE-rebench.md (new file, 84 lines)

@@ -0,0 +1,84 @@
# SWE-rebench

<p align="center">
<a href="https://arxiv.org/abs/2505.20411">📃 Paper</a>
•
<a href="https://huggingface.co/datasets/nebius/SWE-rebench">🤗 HuggingFace</a>
•
<a href="https://swe-rebench.com/leaderboard">📊 Leaderboard</a>
</p>

SWE-rebench is a large-scale dataset of verifiable software engineering tasks. It comes in **two datasets**:

* **[`nebius/SWE-rebench-leaderboard`](https://huggingface.co/datasets/nebius/SWE-rebench-leaderboard)** – an updatable benchmark used for [leaderboard evaluation](https://swe-rebench.com/leaderboard).
* **[`nebius/SWE-rebench`](https://huggingface.co/datasets/nebius/SWE-rebench)** – the full dataset with **21,302 tasks**, suitable for training or large-scale offline evaluation.

This document explains how to run OpenHands on SWE-rebench, using the leaderboard split as the main example. To run on the full dataset, simply replace the dataset name.
## Setting Up

Set up your development environment and configure your LLM provider by following the [SWE-bench README](README.md) in this directory.
## Running Inference

Use the existing SWE-bench inference script, changing the dataset to `nebius/SWE-rebench-leaderboard` and selecting the split (`test` for leaderboard submission):

```bash
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh \
  llm.your_llm HEAD CodeActAgent 30 50 1 nebius/SWE-rebench-leaderboard test
```
Arguments:

* `llm.your_llm` – your model configuration key
* `HEAD` – commit reference for reproducibility
* `CodeActAgent` – agent type
* `30` – number of examples to evaluate
* `50` – maximum iterations per task (increase if needed)
* `1` – number of workers
* `nebius/SWE-rebench-leaderboard` – Hugging Face dataset name
* `test` – dataset split

**Tip:** To run on the **full 21k dataset**, replace `nebius/SWE-rebench-leaderboard` with `nebius/SWE-rebench`; a concrete sketch follows.
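For illustration, a full-dataset run could look like the following. This is only a sketch: it assumes the full dataset also exposes a `test` split, so check the dataset card on Hugging Face for the splits that actually exist.

```bash
# Hypothetical full-dataset run: identical arguments, only the dataset name changes.
# The `test` split name is an assumption; consult the dataset card.
./evaluation/benchmarks/swe_bench/scripts/run_infer.sh \
  llm.your_llm HEAD CodeActAgent 30 50 1 nebius/SWE-rebench test
```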
## Evaluating Results

After inference completes, evaluate using the [SWE-bench-fork evaluation harness](https://github.com/SWE-rebench/SWE-bench-fork).

1. Convert the OpenHands output to SWE-bench evaluation format:

```bash
python evaluation/benchmarks/swe_bench/scripts/live/convert.py \
  --output_jsonl path/to/evaluation/output.jsonl > preds.jsonl
```
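Before moving on, it can help to sanity-check the converted predictions. The exact keys are determined by `convert.py`, but the standard SWE-bench prediction format expects at least `instance_id`, `model_name_or_path`, and `model_patch`. Assuming `jq` is installed:

```bash
# Inspect the keys of the first converted prediction.
head -n 1 preds.jsonl | jq 'keys'
```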
2. Clone the [SWE-bench-fork repo](https://github.com/SWE-rebench/SWE-bench-fork) and follow its README to install dependencies; a typical sequence is sketched below.
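A minimal sketch, assuming the fork installs like upstream SWE-bench (an editable pip install); the fork's README is authoritative:

```bash
git clone https://github.com/SWE-rebench/SWE-bench-fork.git
cd SWE-bench-fork
pip install -e .   # assumed install step; follow the fork's README if it differs
```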
3. Run the evaluation using the fork:

```bash
python -m swebench.harness.run_evaluation \
    --dataset_name nebius/SWE-rebench-leaderboard \
    --split test \
    --predictions_path preds.jsonl \
    --max_workers 10 \
    --run_id openhands
```
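When the run completes, the harness prints a summary of resolved instances and typically writes a per-run report keyed by `--run_id`; see the fork's README for the exact output paths.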
## Citation
```bibtex
@article{badertdinov2025swerebench,
  title={SWE-rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents},
  author={Badertdinov, Ibragim and Golubev, Alexander and Nekrashevich, Maksim and Shevtsov, Anton and Karasik, Simon and Andriushchenko, Andrei and Trofimova, Maria and Litvintseva, Daria and Yangel, Boris},
  journal={arXiv preprint arXiv:2505.20411},
  year={2025}
}
```
evaluation/benchmarks/swe_bench/run_infer.py

```diff
@@ -80,6 +80,8 @@ def set_dataset_type(dataset_name: str) -> str:
         DATASET_TYPE = 'SWE-Gym'
     elif 'swe-bench-live' in name_lower:
         DATASET_TYPE = 'SWE-bench-Live'
+    elif 'swe-rebench' in name_lower:
+        DATASET_TYPE = 'SWE-rebench'
     elif 'multimodal' in name_lower:
         DATASET_TYPE = 'Multimodal'
     else:
```
```diff
@@ -178,6 +180,8 @@ def get_instance_docker_image(
         docker_image_prefix = 'docker.io/starryzhang/'
     elif DATASET_TYPE == 'SWE-bench':
         docker_image_prefix = 'docker.io/swebench/'
+    elif DATASET_TYPE == 'SWE-rebench':
+        docker_image_prefix = 'docker.io/swerebench/'
     repo, name = instance_id.split('__')
     image_name = f'{docker_image_prefix.rstrip("/")}/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
     logger.debug(f'Using official SWE-Bench image: {image_name}')
```
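To make the image-name mapping concrete, here is an illustrative snippet (not part of the patch) that reproduces the construction above; `django__django-11099` is just the sample id borrowed from the entry script's comment:

```bash
INSTANCE_ID="django__django-11099"   # sample id in repo__name form
REPO="${INSTANCE_ID%%__*}"           # -> django
NAME="${INSTANCE_ID#*__}"            # -> django-11099
# Mirrors the f-string in get_instance_docker_image (lowercased at the end):
IMAGE="docker.io/swerebench/sweb.eval.x86_64.${REPO}_1776_${NAME}:latest"
echo "${IMAGE,,}"  # docker.io/swerebench/sweb.eval.x86_64.django_1776_django-11099:latest
```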
```diff
@@ -318,6 +322,8 @@ def initialize_runtime(
     # inject the instance swe entry
     if DATASET_TYPE == 'SWE-bench-Live':
         entry_script_path = 'instance_swe_entry_live.sh'
+    elif DATASET_TYPE == 'SWE-rebench':
+        entry_script_path = 'instance_swe_entry_rebench.sh'
     else:
         entry_script_path = 'instance_swe_entry.sh'
     runtime.copy_to(
```
instance_swe_entry_rebench.sh (new file, 45 lines)

@@ -0,0 +1,45 @@

```bash
#!/usr/bin/env bash

source ~/.bashrc
SWEUTIL_DIR=/swe_util

# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
# SWE_INSTANCE_ID=django__django-11099
if [ -z "$SWE_INSTANCE_ID" ]; then
    echo "Error: SWE_INSTANCE_ID is not set." >&2
    exit 1
fi

# Read swe-bench-instance.json and extract the entry matching the instance_id
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' "$SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json")

if [[ -z "$item" ]]; then
    echo "No item found for the provided instance ID." >&2
    exit 1
fi

# Build the workspace directory name from repo and version, replacing "/" with "__"
WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')

echo "WORKSPACE_NAME: $WORKSPACE_NAME"

# Clear the workspace
if [ -d /workspace ]; then
    rm -rf /workspace/*
else
    mkdir /workspace
fi

# Copy the repo checkout from /testbed into the workspace
if [ -d "/workspace/$WORKSPACE_NAME" ]; then
    rm -rf "/workspace/$WORKSPACE_NAME"
fi
mkdir -p /workspace
cp -r /testbed "/workspace/$WORKSPACE_NAME"

# Activate the instance-specific conda environment
if [ -d /opt/miniconda3 ]; then
    . /opt/miniconda3/etc/profile.d/conda.sh
    conda activate testbed
fi

export PATH=/opt/conda/envs/testbed/bin:$PATH
```
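As with the other entry scripts, `initialize_runtime` copies this file into the instance container (via `runtime.copy_to`, per the hunk above) and executes it with `SWE_INSTANCE_ID` set, so the task repository is staged under `/workspace` before the agent starts.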
evaluation/utils/shared.py

```diff
@@ -263,19 +263,20 @@ def prepare_dataset(
             f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
         )
 
-    def make_serializable(instance: pd.Series) -> dict:
+    def make_serializable(instance_dict: dict) -> dict:
         import numpy as np
 
-        instance_dict = instance.to_dict()
         for k, v in instance_dict.items():
             if isinstance(v, np.ndarray):
                 instance_dict[k] = v.tolist()
             elif isinstance(v, pd.Timestamp):
                 instance_dict[k] = str(v)
+            elif isinstance(v, dict):
+                instance_dict[k] = make_serializable(v)
         return instance_dict
 
     new_dataset = [
-        make_serializable(instance)
+        make_serializable(instance.to_dict())
         for _, instance in dataset.iterrows()
         if str(instance[id_column]) not in finished_ids
     ]
```
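The signature change is what makes the new recursion work: the old `make_serializable` took a `pd.Series` and called `.to_dict()` itself, so the recursive call `make_serializable(v)` on a nested plain `dict` would have failed. Moving `.to_dict()` to the call site lets the same function serialize nested dicts, presumably because SWE-rebench instances contain nested dict fields.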