diff --git a/evaluation/swe_bench/EVAL_PATCH.md b/evaluation/swe_bench/EVAL_PATCH.md deleted file mode 100644 index ec2ac6c023..0000000000 --- a/evaluation/swe_bench/EVAL_PATCH.md +++ /dev/null @@ -1,256 +0,0 @@ -# Evaluate Generated Patches - -## Evaluate patches generated by OpenDevin - -This section explains in detail how `evaluation/swe_bench/scripts/eval_infer.sh` described in [SWE-Bench README](./README.md) works. - -Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an OpenDevin agent. This script is available in the container at `/swe_util/get_agent_report.sh`. - -- `output-file` (*required*): specify the path to your patch file inside the container -- `agent-name` (*required*): your agent name -- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test` -- `num-processes`: defaults to 15. -- `experiment-name`: set to `${parent_folder_of_output_fils}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as experiment name. -- `merge_report`: if set, merges the evaluation report into the original output jsonl file and saves as a `.merged.jsonl` file. - -An example to run evaluation on the given example agent output (`./examples/example_agent_output.json`). - -```shell -export MINICONDA3=/swe_util/miniforge3 -export OD_SWE_BENCH=/OD-SWE-bench -export EVAL_DATA_DIR=/swe_util/eval_data -cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \ ---agent-name CodeActAgent \ ---dataset swe-bench-test-lite \ ---experiment-name test_experiment \ ---merge-report -``` - -You should get the following report: -```shell -- no_generation: 4 -- generated: 26 -- with_logs: 26 -- install_fail: 0 -- reset_failed: 0 -- no_apply: 0 -- applied: 24 -- test_errored: 0 -- test_timeout: 0 -- resolved: 6 -['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612'] -Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json -Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl -``` - -An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`. 
- -```json -"fine_grained_report": { - "gold_tests": { - "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]", - "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]" - }, - "generated": true, - "with_logs": true, - "applied": true, - "test_errored": false, - "test_timeout": false, - "resolved": true, - "log_parse": { - "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED", - "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED", - "tests/test_ext_viewcode.py::test_linkcode": "PASSED", - "tests/test_ext_viewcode.py::test_local_source_files": "PASSED", - "tests/test_ext_viewcode.py::test_viewcode": "FAILED" - }, - "eval_report": { - "FAIL_TO_PASS": { - "success": [ - "tests/test_ext_viewcode.py::test_viewcode_epub_default" - ], - "failure": [] - }, - "PASS_TO_PASS": { - "success": [ - "tests/test_ext_viewcode.py::test_viewcode_epub_enabled", - "tests/test_ext_viewcode.py::test_linkcode", - "tests/test_ext_viewcode.py::test_local_source_files" - ], - "failure": [] - }, - "FAIL_TO_FAIL": { - "success": [], - "failure": [] - }, - "PASS_TO_FAIL": { - "success": [], - "failure": [] - } - } -} -``` - -## If you already have patches not generated by OpenDevin - -### Prepare Output Files - -Ensure that model outputs are formatted correctly as below: -```json -[ - { - "instance_id": "", - "model_patch": "", - "model_name_or_path": "" - }, - ... -] -``` -An example can be found [here](./examples/example_model_output.json). - -Agent output should be adhere to the OpenDevin format. An example can be found [here](./examples/example_agent_output.json). - -### Set Up the Environment - -Before evaluating generated patches, you need to set up the Docker environment. Run the following command to instantiate the Docker container and mount the directory to your output files on the host: - -```shell -docker run -it \ --v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \ -ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 /bin/bash -``` - -### Evaluate Model Generated Patches - -Use `scripts/get_model_report.sh` to evaluate patches generated by a model. This script is located in the container at `/swe_util/get_model_report.sh`. - -- `output-file` (*required*): specify the path to your patch file inside the container -- `model-name` (*required*): this must match the `model_name_or_path` in your patch file -- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test` -- `num-processes`: defaults to 15. -- `experiment-name`: set to `{model-name}__{dataset}` unless specified - -An example to run evaluation on the given example model output (`./examples/example_agent_output.json`). 
- -```shell -export MINICONDA3=/swe_util/miniforge3 -export OD_SWE_BENCH=/swe_util/OD-SWE-bench -export EVAL_DATA_DIR=/swe_util/eval_data -cd /swe_util && ./get_model_report.sh --output-file /swe_bench_output/example_model_output.json \ ---model-name opendevin \ ---dataset swe-bench-test-lite -``` - -You should get the following report: -```shell -- no_generation: 4 -- generated: 26 -- with_logs: 26 -- install_fail: 0 -- reset_failed: 0 -- no_apply: 0 -- applied: 24 -- test_errored: 0 -- test_timeout: 0 -- resolved: 6 -['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612'] -Report saved at /swe_util/eval_data/eval_logs/opendevin__swe-bench-test-lite/example_model_output.report.json -``` -Note: please ignore the `no_apply` in the report for now. - -The script will generate a `{experiment_name}` folder under `$EVAL_DATA_DIR/eval_logs` -```shell -├── $EVAL_DATA_DIR/eval_logs/$experiment_name -│ ├── $experiment_name.json -│ ├── $experiment_name.report.json -│ ├── $model_name # eval log dir -``` - -### Evaluate Agent Generated Patches - -Use `scripts/setup/get_agent_report.sh` to evaluate patches generated by an agent. This script is available in the container at `/swe_util/get_agent_report.sh`. - -- `output-file` (*required*): specify the path to your patch file inside the container -- `agent-name` (*required*): your agent name -- `dataset` (*required*): `swe-bench-test-lite` or `swe-bench-test` -- `num-processes`: defaults to 15. -- `experiment-name`: set to `${parent_folder_of_output_fils}_${current_folder_of_output_file}` if not given. E.g., `xxx/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v2_cd/output.jsonl` -> `CodeActAgent_gpt-4-1106-preview_maxiter_50_N_v2_cd` as experiment name. -- `merge_report`: if set, merges the evaluation report into the original output jsonl file and saves as a `.merged.jsonl` file. - -An example to run evaluation on the given example agent output (`./examples/example_agent_output.json`). - -```shell -export MINICONDA3=/swe_util/miniforge3 -export OD_SWE_BENCH=/OD-SWE-bench -export EVAL_DATA_DIR=/swe_util/eval_data -cd /swe_util && ./get_agent_report.sh --output-file /swe_bench_output/example_agent_output.jsonl \ ---agent-name CodeActAgent \ ---dataset swe-bench-test-lite \ ---experiment-name test_experiment \ ---merge-report -``` - -You should get the following report: -```shell -- no_generation: 4 -- generated: 26 -- with_logs: 26 -- install_fail: 0 -- reset_failed: 0 -- no_apply: 0 -- applied: 24 -- test_errored: 0 -- test_timeout: 0 -- resolved: 6 -['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'django__django-17087', 'sympy__sympy-20590', 'django__django-11583', 'sympy__sympy-21612'] -Report saved at /swe_util/eval_data/eval_logs/test_experiment/test_experiment_swe-bench-test-lite.report.json -Agent output with report merged created at /swe_bench_output/example_agent_output.merged.jsonl -``` - -An additional `fine_grained_report` field will be added to each instance in the `example_agent_output.merged.jsonl`. 
-
-```json
-"fine_grained_report": {
- "gold_tests": {
- "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
- "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
- },
- "generated": true,
- "with_logs": true,
- "applied": true,
- "test_errored": false,
- "test_timeout": false,
- "resolved": true,
- "log_parse": {
- "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
- "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
- "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
- "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
- "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
- },
- "eval_report": {
- "FAIL_TO_PASS": {
- "success": [
- "tests/test_ext_viewcode.py::test_viewcode_epub_default"
- ],
- "failure": []
- },
- "PASS_TO_PASS": {
- "success": [
- "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
- "tests/test_ext_viewcode.py::test_linkcode",
- "tests/test_ext_viewcode.py::test_local_source_files"
- ],
- "failure": []
- },
- "FAIL_TO_FAIL": {
- "success": [],
- "failure": []
- },
- "PASS_TO_FAIL": {
- "success": [],
- "failure": []
- }
- }
-}
-```
diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md
index 052aa1616f..958d6b7649 100644
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -127,6 +127,12 @@ If you want to evaluate existing results, you should first run this to clone exi
git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
```
+To prepare for SWE-bench evaluation, you should pull the evaluation Docker images from [OpenDevin/SWE-bench-docker](https://github.com/OpenDevin/SWE-bench-docker) and download the SWE-bench data by running:
+
+```bash
+evaluation/swe_bench/scripts/eval/prep_eval.sh
+```
+
Then you can run the following:

```bash
@@ -135,55 +141,14 @@ Then you can run the following:
./evaluation/swe_bench/scripts/eval_infer.sh evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.jsonl
```
-The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/output.merged.jsonl`.
+Note: you can also pass in a JSONL in [SWE-Bench format](https://github.com/princeton-nlp/SWE-bench/blob/main/tutorials/evaluation.md#-creating-predictions) to `./evaluation/swe_bench/scripts/eval_infer.sh`, where each line is a JSON object of the form `{"model_patch": "XXX", "model_name_or_path": "YYY", "instance_id": "ZZZ"}`.
-It will contain an additional field `fine_grained_report` (see example below) compared to the `output.jsonl` from the previous inference stage.
+The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_bench/CodeActAgent/gpt-4-1106-preview_maxiter_50_N_v1.0/` with the following files/directories (following the format of [SWE-bench-docker](https://github.com/aorwall/SWE-bench-docker/tree/main/evaluations/SWE-bench_Lite_golden)):
-```json
-"fine_grained_report": {
- "gold_tests": {
- "FAIL_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_default\"]",
- "PASS_TO_PASS": "[\"tests/test_ext_viewcode.py::test_viewcode_epub_enabled\", \"tests/test_ext_viewcode.py::test_linkcode\", \"tests/test_ext_viewcode.py::test_local_source_files\"]"
- },
- "generated": true,
- "with_logs": true,
- "applied": true,
- "test_errored": false,
- "test_timeout": false,
- "resolved": true,
- "log_parse": {
- "tests/test_ext_viewcode.py::test_viewcode_epub_default": "PASSED",
- "tests/test_ext_viewcode.py::test_viewcode_epub_enabled": "PASSED",
- "tests/test_ext_viewcode.py::test_linkcode": "PASSED",
- "tests/test_ext_viewcode.py::test_local_source_files": "PASSED",
- "tests/test_ext_viewcode.py::test_viewcode": "FAILED"
- },
- "eval_report": {
- "FAIL_TO_PASS": {
- "success": [
- "tests/test_ext_viewcode.py::test_viewcode_epub_default"
- ],
- "failure": []
- },
- "PASS_TO_PASS": {
- "success": [
- "tests/test_ext_viewcode.py::test_viewcode_epub_enabled",
- "tests/test_ext_viewcode.py::test_linkcode",
- "tests/test_ext_viewcode.py::test_local_source_files"
- ],
- "failure": []
- },
- "FAIL_TO_FAIL": {
- "success": [],
- "failure": []
- },
- "PASS_TO_FAIL": {
- "success": [],
- "failure": []
- }
- }
-}
-```
+- `README.md`: a report showing which instances passed, failed, etc.
+- `logs/`: a directory of test logs
+- `report.json`: a JSON file that contains keys like `"resolved"` pointing to the instance IDs that are resolved by the agent.
+- `summary.json`: a JSON file that contains more fine-grained information for each test instance.
Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about how to evaluate patches that are already generated (e.g., not by OpenDevin).
@@ -192,8 +157,8 @@ Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about
If you just want to know the resolve rate, and/or a summary of what tests pass and what don't, you could run
```bash
-poetry run python ./evaluation/swe_bench/scripts/summarise_results.py
-# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/output.merged.jsonl
+poetry run python ./evaluation/swe_bench/scripts/summarise_results.py
+# e.g.
poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/report.json ``` ## Submit your evaluation results diff --git a/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh b/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh new file mode 100755 index 0000000000..5c884faa76 --- /dev/null +++ b/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +mkdir evaluation/swe_bench/eval_workspace +pushd evaluation/swe_bench/eval_workspace +git clone https://github.com/OpenDevin/SWE-bench-docker.git +cd SWE-bench-docker +scripts/pull_docker_images.sh docker/ xingyaoww diff --git a/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py b/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py new file mode 100644 index 0000000000..e9958e6ef9 --- /dev/null +++ b/evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py @@ -0,0 +1,26 @@ +import argparse +import os + +import pandas as pd + +parser = argparse.ArgumentParser() +parser.add_argument('od_output_file', type=str) +args = parser.parse_args() +output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl') +print(f'Converting {args.od_output_file} to {output_filepath}') + +od_format = pd.read_json(args.od_output_file, orient='records', lines=True) +# model name is the folder name of od_output_file +model_name = os.path.basename(os.path.dirname(args.od_output_file)) + + +def convert_row_to_swebench_format(row): + return { + 'instance_id': row['instance_id'], + 'model_patch': row['git_patch'].replace('\r\n', '\n'), + 'model_name_or_path': model_name, + } + + +swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1) +swebench_format.to_json(output_filepath, lines=True, orient='records') diff --git a/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py b/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py new file mode 100644 index 0000000000..57c0616a8c --- /dev/null +++ b/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py @@ -0,0 +1,34 @@ +import argparse +import json + +import pandas as pd +from datasets import load_dataset + +parser = argparse.ArgumentParser() +parser.add_argument( + 'output_dir', + type=str, + default='eval_data/instances', + help='Path to the directory to save the instances.', +) +args = parser.parse_args() + +dataset = load_dataset('princeton-nlp/SWE-bench') +test = dataset['test'].to_pandas() +test['FAIL_TO_PASS'] = test['FAIL_TO_PASS'].apply(json.loads) +test['PASS_TO_PASS'] = test['PASS_TO_PASS'].apply(json.loads) +test.to_json(f'{args.output_dir}/swe-bench-test.json', orient='records') + +dataset = load_dataset('princeton-nlp/SWE-bench_Lite') +test = dataset['test'].to_pandas() +test['FAIL_TO_PASS'] = test['FAIL_TO_PASS'].apply(json.loads) +test['PASS_TO_PASS'] = test['PASS_TO_PASS'].apply(json.loads) +test.to_json(f'{args.output_dir}/swe-bench-lite-test.json', orient='records') + +dev = dataset['dev'].to_pandas() +dev['FAIL_TO_PASS'] = dev['FAIL_TO_PASS'].apply(json.loads) +dev['PASS_TO_PASS'] = dev['PASS_TO_PASS'].apply(json.loads) +dev.to_json(f'{args.output_dir}/swe-bench-lite-dev.json', orient='records') + +all_data = pd.concat([test, dev]) +all_data.to_json(f'{args.output_dir}/swe-bench-lite-all.json', orient='records') diff --git a/evaluation/swe_bench/scripts/eval/prep_eval.sh b/evaluation/swe_bench/scripts/eval/prep_eval.sh new file mode 100755 
index 0000000000..0fdbd1afe0 --- /dev/null +++ b/evaluation/swe_bench/scripts/eval/prep_eval.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "Cloning OpenDevin SWE-Bench Fork" +git clone https://github.com/OpenDevin/SWE-bench.git evaluation/swe_bench/eval_workspace/SWE-bench + +echo "Pulling all evaluation dockers..." +evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh + +echo "Downloading SWE-bench data..." +mkdir -p evaluation/swe_bench/eval_workspace/eval_data/instances +poetry run python3 evaluation/swe_bench/scripts/eval/download_swe_bench_data.py evaluation/swe_bench/eval_workspace/eval_data/instances diff --git a/evaluation/swe_bench/scripts/eval_infer.sh b/evaluation/swe_bench/scripts/eval_infer.sh index e3a37082b7..32424f7791 100755 --- a/evaluation/swe_bench/scripts/eval_infer.sh +++ b/evaluation/swe_bench/scripts/eval_infer.sh @@ -11,25 +11,91 @@ if [ ! -f $PROCESS_FILEPATH ]; then exit 1 fi +# If instance_id is empty, it means we want to eval on the whole $PROCESS_FILEPATH +# otherwise, we want to eval on the instance_id +INSTANCE_ID=$2 +echo "INSTANCE_ID: $INSTANCE_ID" + PROCESS_FILEPATH=$(realpath $PROCESS_FILEPATH) FILE_DIR=$(dirname $PROCESS_FILEPATH) FILE_NAME=$(basename $PROCESS_FILEPATH) -mkdir -p $FILE_DIR/eval_logs +mkdir -p $FILE_DIR/logs mkdir -p $FILE_DIR/swe_bench_format echo "Evaluating $FILE_NAME @ $FILE_DIR" -echo "Merged output file with fine-grained report will be saved to $FILE_DIR" +DOCKERHUB_NAMESPACE="xingyaoww" +SWEBENCH_TASKS=$(realpath evaluation/swe_bench/eval_workspace/eval_data/instances/swe-bench-lite-all.json) +export SWEBENCH_DOCKER_FORK_DIR=$(realpath evaluation/swe_bench/eval_workspace/SWE-bench-docker) -docker run --rm \ - -v $FILE_DIR:/swe_bench_output \ - -e MINICONDA3=/swe_util/miniforge3 \ - -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \ - -e EVAL_DATA_DIR=/swe_util/eval_data \ - -w /swe_util \ - ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 \ - bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \ - --agent-name CodeActAgent \ - --dataset swe-bench-test-lite \ - --experiment-name test_experiment \ - --merge-report && cp -r /swe_util/eval_data/eval_logs/test_experiment/* /swe_bench_output/eval_logs \ - && cp -r /swe_util/eval_data/outputs/* /swe_bench_output/swe_bench_format/" +# ================================================ +# detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format +echo "==============================================================" +echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format" +echo "==============================================================" +# SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch +function is_swebench_format() { + # Read the first line of the file + read -r first_line < "$PROCESS_FILEPATH" + + # Use jq to check if the first line has the required fields + echo "$first_line" | jq -e '. | has("model_name_or_path") and has("instance_id") and has("model_patch")' > /dev/null + + if [ $? -ne 0 ]; then + return 1 # Return 1 if the first line does not have the required fields + fi + + return 0 # Return 0 if the first line has the required fields +} +# Call the function with the file path +is_swebench_format "$PROCESS_FILEPATH" +IS_SWEBENCH_FORMAT=$? +# Use the result in an if-else statement +if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then + echo "The file IS in SWE-bench format." + SWEBENCH_FORMAT_JSONL=$PROCESS_FILEPATH +else + echo "The file IS NOT in SWE-bench format." 
+ + # ==== Convert OD format to SWE-bench format ==== + echo "Merged output file with fine-grained report will be saved to $FILE_DIR" + poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH + # replace .jsonl with .swebench.jsonl in filename + SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl} + echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL" + # assert that the file exists + if [ ! -f $SWEBENCH_FORMAT_JSONL ]; then + echo "Error: $SWEBENCH_FORMAT_JSONL does not exist. There is probably an error in the conversion process." + exit 1 + fi + SWEBENCH_FORMAT_JSONL=$(realpath $SWEBENCH_FORMAT_JSONL) +fi +# ================================================ + +echo "==============================================================" +echo "Running SWE-bench evaluation" +echo "==============================================================" + +if [ -z "$INSTANCE_ID" ]; then + echo "Running SWE-bench evaluation on the whole input file..." + + poetry run python $SWEBENCH_DOCKER_FORK_DIR/run_evaluation.py \ + --predictions_path $SWEBENCH_FORMAT_JSONL \ + --log_dir $FILE_DIR/logs \ + --swe_bench_tasks $SWEBENCH_TASKS \ + --namespace $DOCKERHUB_NAMESPACE \ + --timeout 1800 + +else + echo "Running SWE-bench evaluation on the instance_id: $INSTANCE_ID" + poetry run python $SWEBENCH_DOCKER_FORK_DIR/run_single_instance.py \ + --predictions_path $SWEBENCH_FORMAT_JSONL \ + --swe_bench_tasks $SWEBENCH_TASKS \ + --namespace $DOCKERHUB_NAMESPACE \ + --instance_id $INSTANCE_ID +fi + +poetry run python $SWEBENCH_DOCKER_FORK_DIR/generate_report.py \ + --predictions_path $SWEBENCH_FORMAT_JSONL \ + --log_dir $FILE_DIR/logs \ + --output_dir $FILE_DIR \ + --swe_bench_tasks $SWEBENCH_TASKS diff --git a/evaluation/swe_bench/scripts/summarise_results.py b/evaluation/swe_bench/scripts/summarise_results.py index bc40aef7c5..3c3469e319 100644 --- a/evaluation/swe_bench/scripts/summarise_results.py +++ b/evaluation/swe_bench/scripts/summarise_results.py @@ -3,37 +3,37 @@ import sys def extract_test_results(json_file_path): - passed_tests = [] - failed_tests = [] + passed_instances = set() + all_instances = set() + with open(json_file_path, 'r') as file: - for line in file: - data = json.loads(line.strip()) - instance_id = data['instance_id'] - resolved = False - if 'fine_grained_report' in data: - resolved = data['fine_grained_report']['resolved'] - else: - resolved = data['test_result']['result']['resolved'] - if resolved: - passed_tests.append(instance_id) - else: - failed_tests.append(instance_id) - return passed_tests, failed_tests + report = json.load(file) + + # Add resolved instances + for instance_id in report['resolved']: + passed_instances.add(instance_id) + + # Add all instances in the report + for _, instance_ids in report.items(): + for instance_id in instance_ids: + all_instances.add(instance_id) + + return passed_instances, all_instances if __name__ == '__main__': if len(sys.argv) != 2: print( - 'Usage: poetry run python summarise_results.py ' + 'Usage: poetry run python summarise_results.py ' ) sys.exit(1) json_file_path = sys.argv[1] - passed_tests, failed_tests = extract_test_results(json_file_path) - succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests)) + passed_instances, all_instances = extract_test_results(json_file_path) + succ_rate = len(passed_instances) / len(all_instances) print( - f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}' + f'\nPassed {len(passed_instances)} 
tests, total {len(all_instances)} tests, resolve rate = {succ_rate:.2%}' ) print('PASSED TESTS:') - print(passed_tests) + print(sorted(list(passed_instances))) print('FAILED TESTS:') - print(failed_tests) + print(sorted(list(all_instances - passed_instances))) diff --git a/poetry.lock b/poetry.lock index eb32f19af1..3ec91ef60b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1437,6 +1437,23 @@ fastapi = "*" typer = ">=0.12.3" uvicorn = {version = ">=0.15.0", extras = ["standard"]} +[[package]] +name = "fastcore" +version = "1.5.38" +description = "Python supercharged for fastai development" +optional = false +python-versions = ">=3.7" +files = [ + {file = "fastcore-1.5.38-py3-none-any.whl", hash = "sha256:327f011613c986e7f627f63d1d9993c8d6de116c586df94d85806fbfbe45e52a"}, + {file = "fastcore-1.5.38.tar.gz", hash = "sha256:7732403778de9bc2b25bf52617c7fbb9e7ae96010f534a5f00f7e6dee73f1d39"}, +] + +[package.dependencies] +packaging = "*" + +[package.extras] +dev = ["matplotlib", "nbclassic", "nbdev (>=0.2.39)", "numpy", "pandas", "pillow", "torch"] + [[package]] name = "filelock" version = "3.14.0" @@ -1754,6 +1771,25 @@ monitor = ["psutil (>=5.7.0)"] recommended = ["cffi (>=1.12.2)", "dnspython (>=1.16.0,<2.0)", "idna", "psutil (>=5.7.0)"] test = ["cffi (>=1.12.2)", "coverage (>=5.0)", "dnspython (>=1.16.0,<2.0)", "idna", "objgraph", "psutil (>=5.7.0)", "requests"] +[[package]] +name = "ghapi" +version = "1.0.5" +description = "A python client for the GitHub API" +optional = false +python-versions = ">=3.7" +files = [ + {file = "ghapi-1.0.5-py3-none-any.whl", hash = "sha256:24a851b7a256861f173437c807701beac3857a84979067ddc25a8555868ce6dc"}, + {file = "ghapi-1.0.5.tar.gz", hash = "sha256:57f170d50d4e6cbf475d234056c54b1ea7bb917b96b0a19798f6127d8a0c40b1"}, +] + +[package.dependencies] +fastcore = ">=1.5.4" +packaging = "*" +pip = "*" + +[package.extras] +dev = ["jsonref", "matplotlib"] + [[package]] name = "gitdb" version = "4.0.11" @@ -3104,13 +3140,9 @@ files = [ {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"}, {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"}, {file = 
"lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"}, {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"}, {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"}, @@ -4508,6 +4540,17 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa typing = ["typing-extensions"] xmp = ["defusedxml"] +[[package]] +name = "pip" +version = "24.0" +description = "The PyPA recommended tool for installing Python packages." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pip-24.0-py3-none-any.whl", hash = "sha256:ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"}, + {file = "pip-24.0.tar.gz", hash = "sha256:ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"}, +] + [[package]] name = "platformdirs" version = "4.2.2" @@ -6178,6 +6221,32 @@ files = [ {file = "striprtf-0.0.26.tar.gz", hash = "sha256:fdb2bba7ac440072d1c41eab50d8d74ae88f60a8b6575c6e2c7805dc462093aa"}, ] +[[package]] +name = "swebench" +version = "1.1.5" +description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering" +optional = false +python-versions = ">=3.8" +files = [] +develop = false + +[package.dependencies] +beautifulsoup4 = "*" +chardet = "*" +datasets = "*" +ghapi = "*" +GitPython = "*" +python-dotenv = "*" +requests = "*" +rich = "*" +tqdm = "*" + +[package.source] +type = "git" +url = "https://github.com/OpenDevin/SWE-bench.git" +reference = "HEAD" +resolved_reference = "7b0c4b1c249ed4b4600a5bba8afb916d543e034a" + [[package]] name = "sympy" version = "1.12" @@ -7560,4 +7629,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "35c94ddfb97929402ce2094be893045d7cecd9c56517b2fce271094e5cebec2d" +content-hash = "6ecc369caf1256f86a6cfb642213180173c011eb6de7ffecac002ce5d0b4a661" diff --git a/pyproject.toml b/pyproject.toml index 9e5ac4cb0d..969addd4b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,11 +67,13 @@ reportlab = "*" [tool.coverage.run] concurrency = ["gevent"] + [tool.poetry.group.evaluation.dependencies] streamlit = "*" whatthepatch = "*" retry = "*" evaluate = "*" +swebench = { git = "https://github.com/OpenDevin/SWE-bench.git" } [build-system] build-backend = "poetry.core.masonry.api"