diff --git a/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh b/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh index 826bdcdfbc..ed132432e3 100755 --- a/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh +++ b/evaluation/benchmarks/multi_swe_bench/scripts/rollout_multi_swegym.sh @@ -111,15 +111,10 @@ for run_idx in $(seq 1 $N_RUNS); do echo "### Evaluating on $OUTPUT_FILE ... ###" OUTPUT_CONFIG_FILE="${OUTPUT_FILE%.jsonl}_config.json" export EVAL_SKIP_BUILD_ERRORS=true - pip install multi-swe-bench --quiet --disable-pip-version-check > /dev/null 2>&1 COMMAND="poetry run python ./evaluation/benchmarks/multi_swe_bench/scripts/eval/update_multi_swe_bench_config.py --input $OUTPUT_FILE --output $OUTPUT_CONFIG_FILE --dataset $EVAL_DATASET; - python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE + poetry run python -m multi_swe_bench.harness.run_evaluation --config $OUTPUT_CONFIG_FILE " - if [ -n "$EVAL_LIMIT" ]; then - echo "EVAL_LIMIT: $EVAL_LIMIT" - COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" - fi echo "Running command: $COMMAND" # Run the command eval $COMMAND diff --git a/poetry.lock b/poetry.lock index d4d11d8d28..7c4ecdf4f0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. [[package]] name = "aiofiles" @@ -1425,7 +1425,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.8" -groups = ["main", "runtime", "test"] +groups = ["main", "evaluation", "runtime", "test"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -1930,7 +1930,7 @@ version = "45.0.3" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.7" -groups = ["main"] +groups = ["main", "evaluation"] files = [ {file = "cryptography-45.0.3-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:7573d9eebaeceeb55285205dbbb8753ac1e962af3d9640791d12b36864065e71"}, {file = "cryptography-45.0.3-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d377dde61c5d67eb4311eace661c3efda46c62113ff56bf05e2d679e02aebb5b"}, @@ -2212,7 +2212,7 @@ version = "1.2.18" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" -groups = ["main"] +groups = ["main", "evaluation"] files = [ {file = "Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec"}, {file = "deprecated-1.2.18.tar.gz", hash = "sha256:422b6f6d859da6f2ef57857761bfb392480502a64c3028ca9bbe86085d72115d"}, @@ -5500,8 +5500,11 @@ files = [ {file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"}, {file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"}, + {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"}, {file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"}, + {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"}, {file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"}, {file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"}, {file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"}, @@ -6014,6 +6017,28 @@ files = [ {file = "msgpack-1.1.0.tar.gz", hash = "sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e"}, ] +[[package]] +name = "multi-swe-bench" +version = "0.1.2" +description = "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving" +optional = false +python-versions = ">=3.10" +groups = ["evaluation"] +files = [ + {file = "multi_swe_bench-0.1.2-py3-none-any.whl", hash = "sha256:6e6cab26c026a3038109bdda7ea4366333cd210a0785bb138044f8917842e1d0"}, + {file = "multi_swe_bench-0.1.2.tar.gz", hash = "sha256:ff78cce060a9483e90d571872eaf8625447be3054f4ddf8fae0ec9ea9b9f056a"}, +] + +[package.dependencies] +dataclasses_json = "*" +docker = "*" +gitpython = "*" +PyGithub = "*" +pyyaml = "*" +toml = "*" +tqdm = "*" +unidiff = "*" + [[package]] name = "multidict" version = "6.4.4" @@ -8084,7 +8109,7 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" -groups = ["main", "runtime", "test"] +groups = ["main", "evaluation", "runtime", "test"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, @@ -8318,7 +8343,7 @@ version = "2.6.1" description = "Use the full Github API v3" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "evaluation"] files = [ {file = "PyGithub-2.6.1-py3-none-any.whl", hash = "sha256:6f2fa6d076ccae475f9fc392cc6cdbd54db985d4f69b8833a28397de75ed6ca3"}, {file = "pygithub-2.6.1.tar.gz", hash = "sha256:b5c035392991cca63959e9453286b41b54d83bf2de2daa7d7ff7e4312cebf3bf"}, @@ -8353,7 +8378,7 @@ version = "2.10.1" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "evaluation"] files = [ {file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"}, {file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"}, @@ -8385,7 +8410,7 @@ version = "1.5.0" description = "Python binding to the Networking and Cryptography (NaCl) library" optional = false python-versions = ">=3.6" -groups = ["main"] +groups = ["main", "evaluation"] files = [ {file = "PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1"}, {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92"}, @@ -11888,7 +11913,7 @@ version = "1.17.2" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "evaluation"] files = [ {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984"}, {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22"}, @@ -12583,4 +12608,4 @@ third-party-runtimes = ["daytona", "e2b-code-interpreter", "modal", "runloop-api [metadata] lock-version = "2.1" python-versions = "^3.12,<3.14" -content-hash = "056952164091353e4fd14ac7710b1c74bc2e59511578afecf279fc170be1c68e" +content-hash = "6973cf1c9ccb0d6926006697ae4863bcd0ab740a88f09fc17c205e016782cf77" diff --git a/pyproject.toml b/pyproject.toml index 1b04f29594..5f587d382f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -186,6 +186,7 @@ pyarrow = "21.0.0" datasets = "*" joblib = "*" swebench = { git = "https://github.com/ryanhoangt/SWE-bench.git", rev = "fix-modal-patch-eval" } +multi-swe-bench = "0.1.2" [tool.poetry.scripts] openhands = "openhands.cli.entry:main"