Add SWEBench-docker eval (#2085)

* add initial version of swebench-docker eval

* update the branch of git repo

* add poetry run

* download dev set too and pre-load f2p and p2p

* update eval infer script

* increase timeout

* add poetry run

* install swebench from our fork

* update script

* update loc

* support single instance debug

* replace \r\n from model patch

* replace eval docker from namespace xingyaoww

* update script to auto detect swe-bench format jsonl

* support eval infer on single instance id

* change log output dir to logs

* update summarise result script

* update README

* update readme

* tweak branch

* Update evaluation/swe_bench/scripts/eval/prep_eval.sh

Co-authored-by: Graham Neubig <neubig@gmail.com>

---------

Co-authored-by: Graham Neubig <neubig@gmail.com>
Xingyao Wang 2024-06-11 03:30:40 +08:00 committed by GitHub
parent 9605106e72
commit a6ba6c5277
10 changed files with 270 additions and 346 deletions

evaluation/swe_bench/EVAL_PATCH.md (deleted)

@ -1,256 +0,0 @@
# Evaluate Generated Patches
## Evaluate patches generated by OpenDevin
This section explains in detail how `evaluation/swe_bench/scripts/eval_infer.sh`, described in the [SWE-Bench README](./README.md), works.
Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an OpenDevin agent. This script is available in the container at `/swe_util/get_agent_report.sh`.
- `output-file` (*required*): specify the path to your patch file inside the container
- `agent-name` (*required*): your agent name
- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
- `num-processes`: defaults to 15.
- `experiment-name`: set to `${parent_folder_of_output_file}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as the experiment name.
- `merge-report`: if set, merges the evaluation report into the original output JSONL file and saves it as a `.merged.jsonl` file.
An example of running evaluation on the given example agent output (`./examples/example_agent_output.json`):
```shell
export MINICONDA3=/swe_util/miniforge3
export OD_SWE_BENCH=/OD-SWE-bench
export EVAL_DATA_DIR=/swe_util/eval_data
cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \
--agent-name CodeActAgent \
--dataset swe-bench-test-lite \
--experiment-name test_experiment \
--merge-report
```
You should get the following report:
```shell
- no_generation: 4
- generated: 26
- with_logs: 26
- install_fail: 0
- reset_failed: 0
- no_apply: 0
- applied: 24
- test_errored: 0
- test_timeout: 0
- resolved: 6
['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json
Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl
```
An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`.
```json
"fine_grained_report": {
"gold_tests": {
"FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
"PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
},
"generated": true,
"with_logs": true,
"applied": true,
"test_errored": false,
"test_timeout": false,
"resolved": true,
"log_parse": {
"tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
"tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
"tests/test_ext_viewcode.py::test_linkcode": "PASSED",
"tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
"tests/test_ext_viewcode.py::test_viewcode": "FAILED"
},
"eval_report": {
"FAIL_TO_PASS": {
"success": [
"tests/test_ext_viewcode.py::test_viewcode_epub_default"
],
"failure": []
},
"PASS_TO_PASS": {
"success": [
"tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
"tests/test_ext_viewcode.py::test_linkcode",
"tests/test_ext_viewcode.py::test_local_source_files"
],
"failure": []
},
"FAIL_TO_FAIL": {
"success": [],
"failure": []
},
"PASS_TO_FAIL": {
"success": [],
"failure": []
}
}
}
```
## If you already have patches not generated by OpenDevin
### Prepare Output Files
Ensure that model outputs are formatted correctly as below:
```json
[
{
"instance_id": "",
"model_patch": "",
"model_name_or_path": ""
},
...
]
```
An example can be found [here](./examples/example_model_output.json).
Agent output should adhere to the OpenDevin format. An example can be found [here](./examples/example_agent_output.json).
### Set Up the Environment
Before evaluating generated patches, you need to set up the Docker environment. Run the following command to instantiate the Docker container and mount the directory containing your output files on the host:
```shell
docker run -it \
-v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 /bin/bash
```
### Evaluate Model Generated Patches
Use `scripts/get_model_report.sh` to evaluate patches generated by a model. This script is located in the container at `/swe_util/get_model_report.sh`.
- `output-file` (*required*): specify the path to your patch file inside the container
- `model-name` (*required*): this must match the `model_name_or_path` in your patch file
- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
- `num-processes`: defaults to 15.
- `experiment-name`: set to `{model-name}__{dataset}` unless specified
An example of running evaluation on the given example model output (`./examples/example_model_output.json`):
```shell
export MINICONDA3=/swe_util/miniforge3
export OD_SWE_BENCH=/swe_util/OD-SWE-bench
export EVAL_DATA_DIR=/swe_util/eval_data
cd /swe_util && ./get_model_report.sh --output-file /swe_bench_output/example_model_output.json \
--model-name opendevin \
--dataset swe-bench-test-lite
```
You should get the following report:
```shell
- no_generation: 4
- generated: 26
- with_logs: 26
- install_fail: 0
- reset_failed: 0
- no_apply: 0
- applied: 24
- test_errored: 0
- test_timeout: 0
- resolved: 6
['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
Report saved at /swe_util/eval_data/eval_logs/opendevin__swe-bench-test-lite/example_model_output.report.json
```
Note: please ignore the `no_apply` in the report for now.
The script will generate a `{experiment_name}` folder under `$EVAL_DATA_DIR/eval_logs`
```shell
├── $EVAL_DATA_DIR/eval_logs/$experiment_name
│ ├── $experiment_name.json
│ ├── $experiment_name.report.json
│ ├── $model_name # eval log dir
```
### Evaluate Agent Generated Patches
Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an agent. This script is available in the container at `/swe_util/get_agent_report.sh`.
- `output-file` (*required*): specify the path to your patch file inside the container
- `agent-name` (*required*): your agent name
- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test`
- `num-processes`: defaults to 15.
- `experiment-name`: set to `${parent_folder_of_output_file}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as the experiment name.
- `merge-report`: if set, merges the evaluation report into the original output JSONL file and saves it as a `.merged.jsonl` file.
An example of running evaluation on the given example agent output (`./examples/example_agent_output.json`):
```shell
export MINICONDA3=/swe_util/miniforge3
export OD_SWE_BENCH=/OD-SWE-bench
export EVAL_DATA_DIR=/swe_util/eval_data
cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \
--agent-name CodeActAgent \
--dataset swe-bench-test-lite \
--experiment-name test_experiment \
--merge-report
```
You should get the following report:
```shell
- no_generation: 4
- generated: 26
- with_logs: 26
- install_fail: 0
- reset_failed: 0
- no_apply: 0
- applied: 24
- test_errored: 0
- test_timeout: 0
- resolved: 6
['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612']
Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json
Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl
```
An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`.
```json
"fine_grained_report": {
"gold_tests": {
"FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
"PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
},
"generated": true,
"with_logs": true,
"applied": true,
"test_errored": false,
"test_timeout": false,
"resolved": true,
"log_parse": {
"tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
"tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
"tests/test_ext_viewcode.py::test_linkcode": "PASSED",
"tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
"tests/test_ext_viewcode.py::test_viewcode": "FAILED"
},
"eval_report": {
"FAIL_TO_PASS": {
"success": [
"tests/test_ext_viewcode.py::test_viewcode_epub_default"
],
"failure": []
},
"PASS_TO_PASS": {
"success": [
"tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
"tests/test_ext_viewcode.py::test_linkcode",
"tests/test_ext_viewcode.py::test_local_source_files"
],
"failure": []
},
"FAIL_TO_FAIL": {
"success": [],
"failure": []
},
"PASS_TO_FAIL": {
"success": [],
"failure": []
}
}
}
```

evaluation/swe_bench/README.md

@ -127,6 +127,12 @@ If you want to evaluate existing results, you should first run this to clone exi
git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
```
To prepare for SWE-bench evaluation, you should pull the evaluation Docker images from [OpenDevin/SWE-bench-docker](https://github.com/OpenDevin/SWE-bench-docker) and download the SWE-bench data by running:
```bash
evaluation/swe_bench/scripts/eval/prep_eval.sh
```
Then you can run the following:
```bash
@ -135,55 +141,14 @@ Then you can run the following:
./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
```
The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.merged.jsonl`.
PS: You can also pass in a JSONL with [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/swe_bench/scripts/eval_infer.sh`, where each line is a JSON of `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`.
It will contain an additional field `fine_grained_report` (see example below) compared to the `output.jsonl` from the previous inference stage.
The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directories (following the format of [SWE-bench-docker](https://github.com/aorwall/SWE-bench-docker/tree/main/evaluations/SWE-bench_Lite_golden)):
```json
"fine_grained_report": {
"gold_tests": {
"FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
"PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
},
"generated": true,
"with_logs": true,
"applied": true,
"test_errored": false,
"test_timeout": false,
"resolved": true,
"log_parse": {
"tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
"tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
"tests/test_ext_viewcode.py::test_linkcode": "PASSED",
"tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
"tests/test_ext_viewcode.py::test_viewcode": "FAILED"
},
"eval_report": {
"FAIL_TO_PASS": {
"success": [
"tests/test_ext_viewcode.py::test_viewcode_epub_default"
],
"failure": []
},
"PASS_TO_PASS": {
"success": [
"tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
"tests/test_ext_viewcode.py::test_linkcode",
"tests/test_ext_viewcode.py::test_local_source_files"
],
"failure": []
},
"FAIL_TO_FAIL": {
"success": [],
"failure": []
},
"PASS_TO_FAIL": {
"success": [],
"failure": []
}
}
}
```
- `README.md`: a report showing which instances passed, failed, etc.
- `logs/`: a directory of test logs
- `report.json`: a JSON file that contains keys like `"resolved"` pointing to the instance IDs resolved by the agent (a minimal sketch follows this list).
- `summary.json`: a JSON file that contains more fine-grained information for each test instance.
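For reference, here is a minimal sketch of what `report.json` may look like, assuming the SWE-bench-docker convention that each key maps to a list of instance IDs. The instance IDs and the keys other than `resolved` are illustrative; the exact key set is produced by SWE-bench-docker's `generate_report.py`, and `resolved` is the key that `summarise_results.py` relies on.
```json
{
  "resolved": ["django__django-11583", "sympy__sympy-20590"],
  "applied": ["django__django-11583", "sympy__sympy-20590", "sphinx-doc__sphinx-8721"],
  "generated": ["django__django-11583", "sympy__sympy-20590", "sphinx-doc__sphinx-8721"]
}
```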
Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about how to evaluate patches that are already generated (e.g., not by OpenDevin).
@ -192,8 +157,8 @@ Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about
If you just want to know the resolve rate, and/or a summary of what tests pass and what don't, you could run
```bash
poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_output_merged_jsonl_file>
# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/output.merged.jsonl
poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_report_json_file>
# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/report.json
```
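As a side note, the updated `eval_infer.sh` also accepts an optional instance ID as its second argument, which evaluates just that single instance (useful for debugging; see the script in this PR). A minimal usage sketch, with an illustrative output path and instance ID:
```bash
# Evaluate a single instance from an existing output file
./evaluation/swe_bench/scripts/eval_infer.sh \
    evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl \
    django__django-11583
```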
## Submit your evaluation results

evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh

@ -0,0 +1,7 @@
#!/bin/bash
mkdir -p evaluation/swe_bench/eval_workspace
pushd evaluation/swe_bench/eval_workspace
git clone https://github.com/OpenDevin/SWE-bench-docker.git
cd SWE-bench-docker
scripts/pull_docker_images.sh docker/ xingyaoww

evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py

@ -0,0 +1,26 @@
import argparse
import os
import pandas as pd
parser = argparse.ArgumentParser()
parser.add_argument('od_output_file', type=str)
args = parser.parse_args()
output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl')
print(f'Converting {args.od_output_file} to {output_filepath}')
od_format = pd.read_json(args.od_output_file, orient='records', lines=True)
# model name is the folder name of od_output_file
model_name = os.path.basename(os.path.dirname(args.od_output_file))
def convert_row_to_swebench_format(row):
return {
'instance_id': row['instance_id'],
'model_patch': row['git_patch'].replace('\r\n', '\n'),
'model_name_or_path': model_name,
}
swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1)
swebench_format.to_json(output_filepath, lines=True, orient='records')
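For context, this converter is the same one `eval_infer.sh` invokes before evaluation; a minimal standalone usage sketch, with an illustrative input path (the converted file lands next to the input as `output.swebench.jsonl`):
```bash
# Convert an OpenDevin output.jsonl into SWE-bench prediction format
poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py \
    evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
```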

evaluation/swe_bench/scripts/eval/download_swe_bench_data.py

@ -0,0 +1,34 @@
import argparse
import json
import pandas as pd
from datasets import load_dataset
parser = argparse.ArgumentParser()
parser.add_argument(
'output_dir',
type=str,
default='eval_data/instances',
help='Path to the directory to save the instances.',
)
args = parser.parse_args()
dataset = load_dataset('princeton-nlp/SWE-bench')
test = dataset['test'].to_pandas()
test['FAIL_TO_PASS'] = test['FAIL_TO_PASS'].apply(json.loads)
test['PASS_TO_PASS'] = test['PASS_TO_PASS'].apply(json.loads)
test.to_json(f'{args.output_dir}/swe-bench-test.json', orient='records')
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
test = dataset['test'].to_pandas()
test['FAIL_TO_PASS'] = test['FAIL_TO_PASS'].apply(json.loads)
test['PASS_TO_PASS'] = test['PASS_TO_PASS'].apply(json.loads)
test.to_json(f'{args.output_dir}/swe-bench-lite-test.json', orient='records')
dev = dataset['dev'].to_pandas()
dev['FAIL_TO_PASS'] = dev['FAIL_TO_PASS'].apply(json.loads)
dev['PASS_TO_PASS'] = dev['PASS_TO_PASS'].apply(json.loads)
dev.to_json(f'{args.output_dir}/swe-bench-lite-dev.json', orient='records')
all_data = pd.concat([test, dev])
all_data.to_json(f'{args.output_dir}/swe-bench-lite-all.json', orient='records')

evaluation/swe_bench/scripts/eval/prep_eval.sh

@ -0,0 +1,11 @@
#!/bin/bash
echo "Cloning OpenDevin SWE-Bench Fork"
git clone https://github.com/OpenDevin/SWE-bench.git evaluation/swe_bench/eval_workspace/SWE-bench
echo "Pulling all evaluation dockers..."
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
echo "Downloading SWE-bench data..."
mkdir -p evaluation/swe_bench/eval_workspace/eval_data/instances
poetry run python3 evaluation/swe_bench/scripts/eval/download_swe_bench_data.py evaluation/swe_bench/eval_workspace/eval_data/instances

evaluation/swe_bench/scripts/eval_infer.sh

@ -11,25 +11,91 @@ if [ ! -f $PROCESS_FILEPATH ]; then
exit 1
fi
# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH
# otherwise, we want to eval on the instance_id
INSTANCE_ID=$2
echo "INSTANCE_ID: $INSTANCE_ID"
PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH)
FILE_DIR=$(dirname $PROCESS_FILEPATH)
FILE_NAME=$(basename $PROCESS_FILEPATH)
mkdir -p $FILE_DIR/eval_logs
mkdir -p $FILE_DIR/logs
mkdir -p $FILE_DIR/swe_bench_format
echo "Evaluating $FILE_NAME @ $FILE_DIR"
echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
DOCKERHUB_NAMESPACE="xingyaoww"
SWEBENCH_TASKS=$(realpath evaluation/swe_bench/eval_workspace/eval_data/instances/swe-bench-lite-all.json)
export SWEBENCH_DOCKER_FORK_DIR=$(realpath evaluation/swe_bench/eval_workspace/SWE-bench-docker)
docker run --rm \
-v $FILE_DIR:/swe_bench_output \
-e MINICONDA3=/swe_util/miniforge3 \
-e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
-e EVAL_DATA_DIR=/swe_util/eval_data \
-w /swe_util \
ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 \
bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
--agent-name CodeActAgent \
--dataset swe-bench-test-lite \
--experiment-name test_experiment \
--merge-report && cp -r /swe_util/eval_data/eval_logs/test_experiment/* /swe_bench_output/eval_logs \
&& cp -r /swe_util/eval_data/outputs/* /swe_bench_output/swe_bench_format/"
# ================================================
# detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format
echo "=============================================================="
echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format"
echo "=============================================================="
# SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
function is_swebench_format() {
# Read the first line of the file
read -r first_line < "$PROCESS_FILEPATH"
# Use jq to check if the first line has the required fields
echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null
if [ $? -ne 0 ]; then
return 1 # Return 1 if the first line does not have the required fields
fi
return 0 # Return 0 if the first line has the required fields
}
# Call the function with the file path
is_swebench_format "$PROCESS_FILEPATH"
IS_SWEBENCH_FORMAT=$?
# Use the result in an if-else statement
if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
echo "The file IS in SWE-bench format."
SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH
else
echo "The file IS NOT in SWE-bench format."
# ==== Convert OD format to SWE-bench format ====
echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH
# replace .jsonl with .swebench.jsonl in filename
SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
# assert that the file exists
if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then
echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process."
exit 1
fi
SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL)
fi
# ================================================
echo "=============================================================="
echo "Running SWE-bench evaluation"
echo "=============================================================="
if [ -z "$INSTANCE_ID" ]; then
echo "Running SWE-bench evaluation on the whole input file..."
poetry run python $SWEBENCH_DOCKER_FORK_DIR/run_evaluation.py \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--log_dir $FILE_DIR/logs \
--swe_bench_tasks $SWEBENCH_TASKS \
--namespace $DOCKERHUB_NAMESPACE \
--timeout 1800
else
echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID"
poetry run python $SWEBENCH_DOCKER_FORK_DIR/run_single_instance.py \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--swe_bench_tasks $SWEBENCH_TASKS \
--namespace $DOCKERHUB_NAMESPACE \
--instance_id $INSTANCE_ID
fi
poetry run python $SWEBENCH_DOCKER_FORK_DIR/generate_report.py \
--predictions_path $SWEBENCH_FORMAT_JSONL \
--log_dir $FILE_DIR/logs \
--output_dir $FILE_DIR \
--swe_bench_tasks $SWEBENCH_TASKS

evaluation/swe_bench/scripts/summarise_results.py

@ -3,37 +3,37 @@ import sys
def extract_test_results(json_file_path):
passed_tests = []
failed_tests = []
passed_instances = set()
all_instances = set()
with open(json_file_path, 'r') as file:
for line in file:
data = json.loads(line.strip())
instance_id = data['instance_id']
resolved = False
if 'fine_grained_report' in data:
resolved = data['fine_grained_report']['resolved']
else:
resolved = data['test_result']['result']['resolved']
if resolved:
passed_tests.append(instance_id)
else:
failed_tests.append(instance_id)
return passed_tests, failed_tests
report = json.load(file)
# Add resolved instances
for instance_id in report['resolved']:
passed_instances.add(instance_id)
# Add all instances in the report
for _, instance_ids in report.items():
for instance_id in instance_ids:
all_instances.add(instance_id)
return passed_instances, all_instances
if __name__ == '__main__':
if len(sys.argv) != 2:
print(
'Usage: poetry run python summarise_results.py <path_to_output_merged_jsonl_file>'
'Usage: poetry run python summarise_results.py <path_to_report_json_file>'
)
sys.exit(1)
json_file_path = sys.argv[1]
passed_tests, failed_tests = extract_test_results(json_file_path)
succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
passed_instances, all_instances = extract_test_results(json_file_path)
succ_rate = len(passed_instances) / len(all_instances)
print(
f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
f'\nPassed {len(passed_instances)} tests, total {len(all_instances)} tests, resolve rate = {succ_rate:.2%}'
)
print('PASSED TESTS:')
print(passed_tests)
print(sorted(list(passed_instances)))
print('FAILED TESTS:')
print(failed_tests)
print(sorted(list(all_instances - passed_instances)))

poetry.lock

@ -1437,6 +1437,23 @@ fastapi = "*"
typer = ">=0.12.3"
uvicorn = {version = ">=0.15.0", extras = ["standard"]}
[[package]]
name = "fastcore"
version = "1.5.38"
description = "Python supercharged for fastai development"
optional = false
python-versions = ">=3.7"
files = [
{file = "fastcore-1.5.38-py3-none-any.whl", hash = "sha256:327f011613c986e7f627f63d1d9993c8d6de116c586df94d85806fbfbe45e52a"},
{file = "fastcore-1.5.38.tar.gz", hash = "sha256:7732403778de9bc2b25bf52617c7fbb9e7ae96010f534a5f00f7e6dee73f1d39"},
]
[package.dependencies]
packaging = "*"
[package.extras]
dev = ["matplotlib", "nbclassic", "nbdev (>=0.2.39)", "numpy", "pandas", "pillow", "torch"]
[[package]]
name = "filelock"
version = "3.14.0"
@ -1754,6 +1771,25 @@ monitor = ["psutil (>=5.7.0)"]
recommended = ["cffi (>=1.12.2)", "dnspython (>=1.16.0,<2.0)", "idna", "psutil (>=5.7.0)"]
test = ["cffi (>=1.12.2)", "coverage (>=5.0)", "dnspython (>=1.16.0,<2.0)", "idna", "objgraph", "psutil (>=5.7.0)", "requests"]
[[package]]
name = "ghapi"
version = "1.0.5"
description = "A python client for the GitHub API"
optional = false
python-versions = ">=3.7"
files = [
{file = "ghapi-1.0.5-py3-none-any.whl", hash = "sha256:24a851b7a256861f173437c807701beac3857a84979067ddc25a8555868ce6dc"},
{file = "ghapi-1.0.5.tar.gz", hash = "sha256:57f170d50d4e6cbf475d234056c54b1ea7bb917b96b0a19798f6127d8a0c40b1"},
]
[package.dependencies]
fastcore = ">=1.5.4"
packaging = "*"
pip = "*"
[package.extras]
dev = ["jsonref", "matplotlib"]
[[package]]
name = "gitdb"
version = "4.0.11"
@ -3104,13 +3140,9 @@ files = [
{file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"},
{file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"},
{file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"},
{file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"},
{file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"},
{file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"},
@ -4508,6 +4540,17 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa
typing = ["typing-extensions"]
xmp = ["defusedxml"]
[[package]]
name = "pip"
version = "24.0"
description = "The PyPA recommended tool for installing Python packages."
optional = false
python-versions = ">=3.7"
files = [
{file = "pip-24.0-py3-none-any.whl", hash = "sha256:ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"},
{file = "pip-24.0.tar.gz", hash = "sha256:ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"},
]
[[package]]
name = "platformdirs"
version = "4.2.2"
@ -6178,6 +6221,32 @@ files = [
{file = "striprtf-0.0.26.tar.gz", hash = "sha256:fdb2bba7ac440072d1c41eab50d8d74ae88f60a8b6575c6e2c7805dc462093aa"},
]
[[package]]
name = "swebench"
version = "1.1.5"
description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
optional = false
python-versions = ">=3.8"
files = []
develop = false
[package.dependencies]
beautifulsoup4 = "*"
chardet = "*"
datasets = "*"
ghapi = "*"
GitPython = "*"
python-dotenv = "*"
requests = "*"
rich = "*"
tqdm = "*"
[package.source]
type = "git"
url = "https://github.com/OpenDevin/SWE-bench.git"
reference = "HEAD"
resolved_reference = "7b0c4b1c249ed4b4600a5bba8afb916d543e034a"
[[package]]
name = "sympy"
version = "1.12"
@ -7560,4 +7629,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "35c94ddfb97929402ce2094be893045d7cecd9c56517b2fce271094e5cebec2d"
content-hash = "6ecc369caf1256f86a6cfb642213180173c011eb6de7ffecac002ce5d0b4a661"

pyproject.toml

@ -67,11 +67,13 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
retry = "*"
evaluate = "*"
swebench = { git = "https://github.com/OpenDevin/SWE-bench.git" }
[build-system]
build-backend = "poetry.core.masonry.api"