diff --git a/.gitignore b/.gitignore index bc1a7db343..89a4102434 100644 --- a/.gitignore +++ b/.gitignore @@ -160,6 +160,9 @@ cython_debug/ .idea/ .vscode/ +# evaluation +evaluation/SWE-bench/data + # frontend # dependencies diff --git a/evaluation/README.md b/evaluation/README.md index 06b0fc1532..966559d9a7 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -19,4 +19,6 @@ all the preprocessing/evaluation/analysis scripts. - resources - Devin's outputs processed for evaluations is available on [Huggingface](https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output) - get predictions that passed the test: `wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_passed.json` - - get all predictions`wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_outputs.json` + - get all predictions `wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_outputs.json` + +See [`SWE-bench/README.md`](./SWE-bench/README.md) for more details on how to run SWE-Bench for evaluation. 
diff --git a/evaluation/SWE-bench/Dockerfile b/evaluation/SWE-bench/Dockerfile new file mode 100644 index 0000000000..aabcee88fc --- /dev/null +++ b/evaluation/SWE-bench/Dockerfile @@ -0,0 +1,39 @@ +FROM ubuntu:20.04 + +# https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192 +RUN apt-get update && \ + apt-get install -y bash gcc git jq wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN git config --global user.email "swebench@pnlp.org" +RUN git config --global user.name "swebench" + +RUN apt update && apt install -y build-essential + +# Create new user +RUN useradd -ms /bin/bash swe-bench +USER swe-bench +WORKDIR /home/swe-bench + +# Setup Conda +ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}" +ARG PATH="/home/swe-bench/miniconda3/bin:${PATH}" +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh \ + && mkdir ~/.conda \ + && bash miniconda.sh -b \ + && rm -f miniconda.sh +RUN conda --version + +# Setup SWE-Bench Env +COPY environment.yml . +RUN conda env create -f environment.yml + +# Some missing packages +RUN pip install datasets python-dotenv gitpython + +RUN conda init bash + +CMD ["/bin/bash"] +# docker build -t opendevin/eval-swe-bench:v0.1 -f evaluation/swe-bench/Dockerfile evaluation/swe-bench/ +# docker push opendevin/eval-swe-bench:v0.1 diff --git a/evaluation/SWE-bench/README.md b/evaluation/SWE-bench/README.md new file mode 100644 index 0000000000..539858463d --- /dev/null +++ b/evaluation/SWE-bench/README.md @@ -0,0 +1,79 @@ +# SWE-Bench Evaluation + +Work in-progress. + +**TODOs**: + +- [ ] Generate `predictions` files given an OpenDevin `Agent` implementation. We could borrow something from [devin's eval-harness implementation](https://github.com/CognitionAI/devin-swebench-results/tree/main/harness), for example, [how to generate `TestSpec`](https://github.com/CognitionAI/devin-swebench-results/blob/main/harness/scripts.py#L150-L160). 
+- [ ] Make sure the evaluation suite runs on all repos. I only tested on `matplotlib` so far, `scikit-learn` does not work for now (see [this issue](https://github.com/princeton-nlp/SWE-bench/issues/57)). + + +## Run tests for a prediction file inside a docker container + +Currently, the docker container should be usable for running SWE-Bench. It was tested on `matplotlib`, but it requires further testing to make sure it works on other repositories. Currently, [it does not work for `scikit-learn`](https://github.com/princeton-nlp/SWE-bench/issues/57). + +### Setup example data + +```bash +cd evaluation/SWE-bench +./scripts/prepare_devin_swe_bench_data.sh + +# Clone the repo +# This is a fork that fixes some issues that stop matplotlib from running (see https://github.com/princeton-nlp/SWE-bench/pull/56) +git clone https://github.com/xingyaoww/SWE-bench.git + +# Enter the docker container +./scripts/run_docker_interactive.sh +``` + +### Run evaluation + +```bash +#!/bin/bash +mkdir -p data/logs +mkdir -p data/testbeds + +python SWE-bench/harness/run_evaluation.py \ + --predictions_path data/predictions/devin_swe_outputs.json \ + --swe_bench_tasks data/processed/swe-bench-test.json \ + --log_dir data/logs \ + --testbed data/testbeds \ + --skip_existing \ + --timeout 900 \ + --verbose +``` + +You will see the command line outputs similar to this (if successful): + +```log +swe-bench@2f3a6b9fcab2:/swe-bench$ ./harness/run_evaluation.sh +/swe-bench/harness/run_evaluation.py:101: SyntaxWarning: assertion is always true, perhaps remove parentheses? 
+ assert(temp, datasets.arrow_dataset.Dataset) +2024-03-20 09:21:18,796 - INFO - Found 1 predictions across 1 model(s) in predictions file +2024-03-20 09:21:18,796 - INFO - [claude-2/matplotlib__matplotlib/3.6] # of predictions to evaluate: 1 (0 already evaluated) +2024-03-20 09:21:18,797 - INFO - [Testbed] Creating log directory /swe-bench/data/logs/claude-2 +2024-03-20 09:21:18,797 - INFO - [Testbed] Using conda path /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708 +2024-03-20 09:21:18,797 - INFO - [Testbed] Using working directory /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23 for testbed +2024-03-20 09:21:18,797 - INFO - [Testbed] Repo matplotlib/matplotlib: 1 versions +2024-03-20 09:21:18,797 - INFO - [Testbed] Version 3.6: 1 instances +2024-03-20 09:21:18,797 - INFO - No conda path provided, creating temporary install in /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3... +2024-03-20 09:21:27,482 - INFO - [Testbed] Using conda path /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3 +2024-03-20 09:21:27,942 - INFO - [Testbed] Setting up testbed for matplotlib__matplotlib__3.6 +2024-03-20 09:21:44,257 - INFO - [Testbed] Cloned matplotlib/matplotlib to /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23/matplotlib__matplotlib__3.6 +2024-03-20 09:21:44,415 - INFO - [Testbed] Creating environment matplotlib__matplotlib__3.6; Command: /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3/bin/conda env create --file /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23/environment.yml +2024-03-20 09:23:39,781 - INFO - [Testbed] Installing pip packages for matplotlib__matplotlib__3.6; Command: . 
/swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3/bin/activate matplotlib__matplotlib__3.6 && pip install pytest +/swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmpfy1qth23/matplotlib__matplotlib__3.6: 1 instances +2024-03-20 09:23:42,309 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Reset task environment to aca6e9d5e98811ca37c442217914b15e78127c89 +2024-03-20 09:23:42,314 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Apply patch successful (pred_try) +2024-03-20 09:23:42,318 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Revert patch successful (pred_try) +2024-03-20 09:23:42,318 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Installing with command: . /swe-bench/data/testbeds/claude-2/matplotlib__matplotlib/3.6/tmp09wrm708/miniconda3/bin/activate matplotlib__matplotlib__3.6 && echo 'activate successful' && python -m pip install -e . +2024-03-20 09:24:54,966 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Installation successful +2024-03-20 09:24:54,970 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Apply patch successful (test) +2024-03-20 09:24:54,974 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Apply patch successful (pred) +2024-03-20 09:25:04,775 - INFO - [matplotlib__matplotlib__3.6] [matplotlib__matplotlib-24362] Test script run successful +swe-bench@2f3a6b9fcab2:/swe-bench$ +``` + +### Interpret Results + +Then you may interpret the results under `data/logs`, and interpret it following [this guide](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-metrics). 
diff --git a/evaluation/SWE-bench/environment.yml b/evaluation/SWE-bench/environment.yml new file mode 100644 index 0000000000..302fc856bf --- /dev/null +++ b/evaluation/SWE-bench/environment.yml @@ -0,0 +1,15 @@ +# FROM https://github.com/princeton-nlp/SWE-bench/blob/main/environment.yml +name: swe-bench +dependencies: + - python=3.9 + - pip + - pip: + - beautifulsoup4 + - chardet + - ghapi + - GitPython + - python-dotenv + - requests + - rich + - transformers>=4.34.0 + - conda-forge::gh diff --git a/evaluation/SWE-bench/scripts/download_test_data.py b/evaluation/SWE-bench/scripts/download_test_data.py new file mode 100644 index 0000000000..f664f0c2ac --- /dev/null +++ b/evaluation/SWE-bench/scripts/download_test_data.py @@ -0,0 +1,6 @@ +from datasets import load_dataset +import pandas as pd + +dataset = load_dataset("princeton-nlp/SWE-bench") +test = dataset["test"].to_pandas() +test.to_json("data/processed/swe-bench-test.json", orient="records") diff --git a/evaluation/SWE-bench/scripts/prepare_devin_swe_bench_data.sh b/evaluation/SWE-bench/scripts/prepare_devin_swe_bench_data.sh new file mode 100755 index 0000000000..ef2c76f462 --- /dev/null +++ b/evaluation/SWE-bench/scripts/prepare_devin_swe_bench_data.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -xeo pipefail +mkdir -p data/processed +python3 scripts/download_test_data.py + +# Download an example output file (FROM claude-2) +# https://gist.github.com/sorendunn/9f1f1fade59f986b4925b6633f9ff165 +mkdir -p data/predictions +wget https://huggingface.co/datasets/OpenDevin/Devin-SWE-bench-output/raw/main/devin_swe_outputs.json -O data/predictions/devin_swe_outputs.json diff --git a/evaluation/SWE-bench/scripts/run_docker_interactive.sh b/evaluation/SWE-bench/scripts/run_docker_interactive.sh new file mode 100755 index 0000000000..4407884e45 --- /dev/null +++ b/evaluation/SWE-bench/scripts/run_docker_interactive.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +DOCKER_IMAGE=opendevin/eval-swe-bench:v0.1 +WORK_DIR=`pwd` + +docker run \ 
+ -it \ + --rm \ + --user root \ + --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ + -v $WORK_DIR:/swe-bench \ + -w /swe-bench \ + $DOCKER_IMAGE \ + /bin/bash -c "usermod -u $(id -u) swe-bench && su swe-bench" diff --git a/requirements.txt b/requirements.txt index 700ec2d821..b1f2293da7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ +datasets +pandas litellm -termcolor +termcolor \ No newline at end of file