[eval] Upgrade SWE-Bench to use official image and latest harness (#6838)

Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
This commit is contained in:
Xingyao Wang 2025-02-27 08:15:05 -05:00 committed by GitHub
parent 0137600988
commit 33780f97d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 3267 additions and 2778 deletions

View File

@ -11,7 +11,11 @@ from swebench.harness.run_evaluation import (
APPLY_PATCH_FAIL,
APPLY_PATCH_PASS,
)
from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
from swebench.harness.test_spec.test_spec import (
SWEbenchInstance,
TestSpec,
make_test_spec,
)
from swebench.harness.utils import load_swebench_dataset
from tqdm import tqdm

View File

@ -58,8 +58,6 @@ def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
def get_instruction(instance: pd.Series, metadata: EvalMetadata):
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
# Prepare instruction
# Instruction based on Anthropic's official trajectory
# https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
instruction = (
@ -71,14 +69,20 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata):
f'{instance.problem_statement}\n'
'</issue_description>\n\n'
'Can you help me implement the necessary changes to the repository so that the requirements specified in the <issue_description> are met?\n'
"I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n'
"I've already taken care of all changes to any of the test files described in the <issue_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n"
"Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.\n"
'Your task is to make the minimal changes to non-test files in the /workspace directory to ensure the <issue_description> is satisfied.\n'
'Follow these steps to resolve the issue:\n'
'1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n'
'2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error\n'
'3. Edit the sourcecode of the repo to resolve the issue\n'
'4. Rerun your reproduce script and confirm that the error is fixed!\n'
'5. Think about edgecases and make sure your fix handles them as well\n'
'5. Think about edgecases, add comprehensive tests for them in your reproduce script, and run them to make sure your fix handles them as well\n'
f'6. Once you are done with the initial implementation, please carefully re-read the problem description and check the difference between the current code and the base commit {instance["base_commit"]}. Do you think that the issue has been completely and comprehensively solved? Write tests to check the correctness of the solution, specifically focusing on tests that may point out any remaining problems that are not yet solved. Run all of the tests in the repo and check if any of them fail, and if they do fix the code. Repeat this process of carefully reading the problem description and current implementation, testing, and fixing any problems until you are confident that the current implementation is correct. Find and run any tests in the repo that are related to:\n'
' - The issue you are fixing\n'
' - The files you modified\n'
' - The functions you changed\n'
' Make sure all these tests pass with your changes.\n'
"Your thinking should be thorough and so it's fine if it's very long.\n"
)
@ -96,11 +100,19 @@ DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xing
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
def get_instance_docker_image(instance_id: str) -> str:
image_name = 'sweb.eval.x86_64.' + instance_id
image_name = image_name.replace(
'__', '_s_'
) # to comply with docker image naming convention
def get_instance_docker_image(instance_id: str, official_image: bool = False) -> str:
if official_image:
# Official SWE-Bench image
# swebench/sweb.eval.x86_64.django_1776_django-11333:v1
repo, name = instance_id.split('__')
image_name = f'sweb.eval.x86_64.{repo}_1776_{name}:latest'
logger.warning(f'Using official SWE-Bench image: {image_name}')
else:
# OpenHands version of the image
image_name = 'sweb.eval.x86_64.' + instance_id
image_name = image_name.replace(
'__', '_s_'
) # to comply with docker image naming convention
return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + image_name).lower()
@ -111,7 +123,12 @@ def get_config(
SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
if USE_INSTANCE_IMAGE:
# We use a different instance image for the each instance of swe-bench eval
base_container_image = get_instance_docker_image(instance['instance_id'])
use_official_image = bool(
'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
)
base_container_image = get_instance_docker_image(
instance['instance_id'], use_official_image
)
logger.info(
f'Using instance container image: {base_container_image}. '
f'Please make sure this image exists. '

View File

@ -1,336 +1,300 @@
sweb.base.x86_64:latest
sweb.env.x86_64.088a7e628bda9770f9757b:latest
sweb.env.x86_64.0d80c7dec81ee2f2f513e2:latest
sweb.env.x86_64.0f99bce2750f3109957bec:latest
sweb.env.x86_64.1b3b218535da0abf4469cb:latest
sweb.env.x86_64.1c1a6945f732f9391228c5:latest
sweb.env.x86_64.1f92e6d7cef88badc4f744:latest
sweb.env.x86_64.27dd9791e13f5c857a09f9:latest
sweb.env.x86_64.297af196949a2a635bce66:latest
sweb.env.x86_64.2baaea72acc974f6c02079:latest
sweb.env.x86_64.2e50125951bc69cddd7421:latest
sweb.env.x86_64.2f217c8b4490bfa0e2ba14:latest
sweb.env.x86_64.31244378a92e3bcce809ac:latest
sweb.env.x86_64.428468730904ff6b4232aa:latest
sweb.env.x86_64.5d1fda9d55d65d8a4e5bdb:latest
sweb.env.x86_64.6b007979cf533f0f3016e8:latest
sweb.env.x86_64.7037e8c448a4b8ebfe9b13:latest
sweb.env.x86_64.71498c7426dbf05599642f:latest
sweb.env.x86_64.756beac07713d7e8dc1129:latest
sweb.env.x86_64.78278ae2cf880e395f1337:latest
sweb.env.x86_64.8f1f7b974f0c57c7aeba39:latest
sweb.env.x86_64.934a137824256b612e9dc5:latest
sweb.env.x86_64.a0efca7a0fe6719dbf65c2:latest
sweb.env.x86_64.a18371b03f944585b4f08c:latest
sweb.env.x86_64.a33dddf55cdff5d8e23374:latest
sweb.env.x86_64.aa92880033da20ca313928:latest
sweb.env.x86_64.b649f0ff62fad147f7f073:latest
sweb.env.x86_64.b7ce4be3b3c35f68c61248:latest
sweb.env.x86_64.c70909fdac4897d1c685df:latest
sweb.env.x86_64.c795f4b88616b8462021ed:latest
sweb.env.x86_64.cc47cc71483942d0c3a15e:latest
sweb.env.x86_64.dc5ff4c0e3fe8db5afc4da:latest
sweb.env.x86_64.e3afd7f04b325a4de4982d:latest
sweb.env.x86_64.e5bb89bf78258a7d14c34b:latest
sweb.env.x86_64.e83e37f52c09532c62acfb:latest
sweb.env.x86_64.efa6065ed5bf204410fd53:latest
sweb.eval.x86_64.astropy_s_astropy-12907:latest
sweb.eval.x86_64.astropy_s_astropy-14182:latest
sweb.eval.x86_64.astropy_s_astropy-14365:latest
sweb.eval.x86_64.astropy_s_astropy-14995:latest
sweb.eval.x86_64.astropy_s_astropy-6938:latest
sweb.eval.x86_64.astropy_s_astropy-7746:latest
sweb.eval.x86_64.django_s_django-10914:latest
sweb.eval.x86_64.django_s_django-10924:latest
sweb.eval.x86_64.django_s_django-11001:latest
sweb.eval.x86_64.django_s_django-11019:latest
sweb.eval.x86_64.django_s_django-11039:latest
sweb.eval.x86_64.django_s_django-11049:latest
sweb.eval.x86_64.django_s_django-11099:latest
sweb.eval.x86_64.django_s_django-11133:latest
sweb.eval.x86_64.django_s_django-11179:latest
sweb.eval.x86_64.django_s_django-11283:latest
sweb.eval.x86_64.django_s_django-11422:latest
sweb.eval.x86_64.django_s_django-11564:latest
sweb.eval.x86_64.django_s_django-11583:latest
sweb.eval.x86_64.django_s_django-11620:latest
sweb.eval.x86_64.django_s_django-11630:latest
sweb.eval.x86_64.django_s_django-11742:latest
sweb.eval.x86_64.django_s_django-11797:latest
sweb.eval.x86_64.django_s_django-11815:latest
sweb.eval.x86_64.django_s_django-11848:latest
sweb.eval.x86_64.django_s_django-11905:latest
sweb.eval.x86_64.django_s_django-11910:latest
sweb.eval.x86_64.django_s_django-11964:latest
sweb.eval.x86_64.django_s_django-11999:latest
sweb.eval.x86_64.django_s_django-12113:latest
sweb.eval.x86_64.django_s_django-12125:latest
sweb.eval.x86_64.django_s_django-12184:latest
sweb.eval.x86_64.django_s_django-12284:latest
sweb.eval.x86_64.django_s_django-12286:latest
sweb.eval.x86_64.django_s_django-12308:latest
sweb.eval.x86_64.django_s_django-12453:latest
sweb.eval.x86_64.django_s_django-12470:latest
sweb.eval.x86_64.django_s_django-12497:latest
sweb.eval.x86_64.django_s_django-12589:latest
sweb.eval.x86_64.django_s_django-12700:latest
sweb.eval.x86_64.django_s_django-12708:latest
sweb.eval.x86_64.django_s_django-12747:latest
sweb.eval.x86_64.django_s_django-12856:latest
sweb.eval.x86_64.django_s_django-12908:latest
sweb.eval.x86_64.django_s_django-12915:latest
sweb.eval.x86_64.django_s_django-12983:latest
sweb.eval.x86_64.django_s_django-13028:latest
sweb.eval.x86_64.django_s_django-13033:latest
sweb.eval.x86_64.django_s_django-13158:latest
sweb.eval.x86_64.django_s_django-13220:latest
sweb.eval.x86_64.django_s_django-13230:latest
sweb.eval.x86_64.django_s_django-13265:latest
sweb.eval.x86_64.django_s_django-13315:latest
sweb.eval.x86_64.django_s_django-13321:latest
sweb.eval.x86_64.django_s_django-13401:latest
sweb.eval.x86_64.django_s_django-13447:latest
sweb.eval.x86_64.django_s_django-13448:latest
sweb.eval.x86_64.django_s_django-13551:latest
sweb.eval.x86_64.django_s_django-13590:latest
sweb.eval.x86_64.django_s_django-13658:latest
sweb.eval.x86_64.django_s_django-13660:latest
sweb.eval.x86_64.django_s_django-13710:latest
sweb.eval.x86_64.django_s_django-13757:latest
sweb.eval.x86_64.django_s_django-13768:latest
sweb.eval.x86_64.django_s_django-13925:latest
sweb.eval.x86_64.django_s_django-13933:latest
sweb.eval.x86_64.django_s_django-13964:latest
sweb.eval.x86_64.django_s_django-14016:latest
sweb.eval.x86_64.django_s_django-14017:latest
sweb.eval.x86_64.django_s_django-14155:latest
sweb.eval.x86_64.django_s_django-14238:latest
sweb.eval.x86_64.django_s_django-14382:latest
sweb.eval.x86_64.django_s_django-14411:latest
sweb.eval.x86_64.django_s_django-14534:latest
sweb.eval.x86_64.django_s_django-14580:latest
sweb.eval.x86_64.django_s_django-14608:latest
sweb.eval.x86_64.django_s_django-14667:latest
sweb.eval.x86_64.django_s_django-14672:latest
sweb.eval.x86_64.django_s_django-14730:latest
sweb.eval.x86_64.django_s_django-14752:latest
sweb.eval.x86_64.django_s_django-14787:latest
sweb.eval.x86_64.django_s_django-14855:latest
sweb.eval.x86_64.django_s_django-14915:latest
sweb.eval.x86_64.django_s_django-14997:latest
sweb.eval.x86_64.django_s_django-14999:latest
sweb.eval.x86_64.django_s_django-15061:latest
sweb.eval.x86_64.django_s_django-15202:latest
sweb.eval.x86_64.django_s_django-15213:latest
sweb.eval.x86_64.django_s_django-15252:latest
sweb.eval.x86_64.django_s_django-15320:latest
sweb.eval.x86_64.django_s_django-15347:latest
sweb.eval.x86_64.django_s_django-15388:latest
sweb.eval.x86_64.django_s_django-15400:latest
sweb.eval.x86_64.django_s_django-15498:latest
sweb.eval.x86_64.django_s_django-15695:latest
sweb.eval.x86_64.django_s_django-15738:latest
sweb.eval.x86_64.django_s_django-15781:latest
sweb.eval.x86_64.django_s_django-15789:latest
sweb.eval.x86_64.django_s_django-15790:latest
sweb.eval.x86_64.django_s_django-15814:latest
sweb.eval.x86_64.django_s_django-15819:latest
sweb.eval.x86_64.django_s_django-15851:latest
sweb.eval.x86_64.django_s_django-15902:latest
sweb.eval.x86_64.django_s_django-15996:latest
sweb.eval.x86_64.django_s_django-16041:latest
sweb.eval.x86_64.django_s_django-16046:latest
sweb.eval.x86_64.django_s_django-16139:latest
sweb.eval.x86_64.django_s_django-16229:latest
sweb.eval.x86_64.django_s_django-16255:latest
sweb.eval.x86_64.django_s_django-16379:latest
sweb.eval.x86_64.django_s_django-16400:latest
sweb.eval.x86_64.django_s_django-16408:latest
sweb.eval.x86_64.django_s_django-16527:latest
sweb.eval.x86_64.django_s_django-16595:latest
sweb.eval.x86_64.django_s_django-16816:latest
sweb.eval.x86_64.django_s_django-16820:latest
sweb.eval.x86_64.django_s_django-16873:latest
sweb.eval.x86_64.django_s_django-16910:latest
sweb.eval.x86_64.django_s_django-17051:latest
sweb.eval.x86_64.django_s_django-17087:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-18869:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-22711:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-22835:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23299:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23314:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23476:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23562:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23563:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23913:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23964:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-23987:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24149:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24265:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24334:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-24970:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25079:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25311:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25332:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25433:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25442:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-25498:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-26011:latest
sweb.eval.x86_64.matplotlib_s_matplotlib-26020:latest
sweb.eval.x86_64.mwaskom_s_seaborn-2848:latest
sweb.eval.x86_64.mwaskom_s_seaborn-3010:latest
sweb.eval.x86_64.mwaskom_s_seaborn-3190:latest
sweb.eval.x86_64.mwaskom_s_seaborn-3407:latest
sweb.eval.x86_64.pallets_s_flask-4045:latest
sweb.eval.x86_64.pallets_s_flask-4992:latest
sweb.eval.x86_64.pallets_s_flask-5063:latest
sweb.eval.x86_64.psf_s_requests-1963:latest
sweb.eval.x86_64.psf_s_requests-2148:latest
sweb.eval.x86_64.psf_s_requests-2317:latest
sweb.eval.x86_64.psf_s_requests-2674:latest
sweb.eval.x86_64.psf_s_requests-3362:latest
sweb.eval.x86_64.psf_s_requests-863:latest
sweb.eval.x86_64.pydata_s_xarray-3364:latest
sweb.eval.x86_64.pydata_s_xarray-4094:latest
sweb.eval.x86_64.pydata_s_xarray-4248:latest
sweb.eval.x86_64.pydata_s_xarray-4493:latest
sweb.eval.x86_64.pydata_s_xarray-5131:latest
sweb.eval.x86_64.pylint-dev_s_pylint-5859:latest
sweb.eval.x86_64.pylint-dev_s_pylint-6506:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7080:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7114:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7228:latest
sweb.eval.x86_64.pylint-dev_s_pylint-7993:latest
sweb.eval.x86_64.pytest-dev_s_pytest-11143:latest
sweb.eval.x86_64.pytest-dev_s_pytest-11148:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5103:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5221:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5227:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5413:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5495:latest
sweb.eval.x86_64.pytest-dev_s_pytest-5692:latest
sweb.eval.x86_64.pytest-dev_s_pytest-6116:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7168:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7220:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7373:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7432:latest
sweb.eval.x86_64.pytest-dev_s_pytest-7490:latest
sweb.eval.x86_64.pytest-dev_s_pytest-8365:latest
sweb.eval.x86_64.pytest-dev_s_pytest-8906:latest
sweb.eval.x86_64.pytest-dev_s_pytest-9359:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10297:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10508:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-10949:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-11040:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-11281:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-12471:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13142:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13241:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13439:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13496:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13497:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13584:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-13779:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14087:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14092:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14894:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-14983:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-15512:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-15535:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25500:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25570:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25638:latest
sweb.eval.x86_64.scikit-learn_s_scikit-learn-25747:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-10325:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-10451:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-11445:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-7686:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-7738:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-7975:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8273:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8282:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8435:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8474:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8506:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8595:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8627:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8713:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8721:latest
sweb.eval.x86_64.sphinx-doc_s_sphinx-8801:latest
sweb.eval.x86_64.sympy_s_sympy-11400:latest
sweb.eval.x86_64.sympy_s_sympy-11870:latest
sweb.eval.x86_64.sympy_s_sympy-11897:latest
sweb.eval.x86_64.sympy_s_sympy-12171:latest
sweb.eval.x86_64.sympy_s_sympy-12236:latest
sweb.eval.x86_64.sympy_s_sympy-12419:latest
sweb.eval.x86_64.sympy_s_sympy-12454:latest
sweb.eval.x86_64.sympy_s_sympy-12481:latest
sweb.eval.x86_64.sympy_s_sympy-13031:latest
sweb.eval.x86_64.sympy_s_sympy-13043:latest
sweb.eval.x86_64.sympy_s_sympy-13146:latest
sweb.eval.x86_64.sympy_s_sympy-13177:latest
sweb.eval.x86_64.sympy_s_sympy-13437:latest
sweb.eval.x86_64.sympy_s_sympy-13471:latest
sweb.eval.x86_64.sympy_s_sympy-13480:latest
sweb.eval.x86_64.sympy_s_sympy-13647:latest
sweb.eval.x86_64.sympy_s_sympy-13773:latest
sweb.eval.x86_64.sympy_s_sympy-13895:latest
sweb.eval.x86_64.sympy_s_sympy-13915:latest
sweb.eval.x86_64.sympy_s_sympy-13971:latest
sweb.eval.x86_64.sympy_s_sympy-14024:latest
sweb.eval.x86_64.sympy_s_sympy-14308:latest
sweb.eval.x86_64.sympy_s_sympy-14317:latest
sweb.eval.x86_64.sympy_s_sympy-14396:latest
sweb.eval.x86_64.sympy_s_sympy-14774:latest
sweb.eval.x86_64.sympy_s_sympy-14817:latest
sweb.eval.x86_64.sympy_s_sympy-15011:latest
sweb.eval.x86_64.sympy_s_sympy-15308:latest
sweb.eval.x86_64.sympy_s_sympy-15345:latest
sweb.eval.x86_64.sympy_s_sympy-15346:latest
sweb.eval.x86_64.sympy_s_sympy-15609:latest
sweb.eval.x86_64.sympy_s_sympy-15678:latest
sweb.eval.x86_64.sympy_s_sympy-16106:latest
sweb.eval.x86_64.sympy_s_sympy-16281:latest
sweb.eval.x86_64.sympy_s_sympy-16503:latest
sweb.eval.x86_64.sympy_s_sympy-16792:latest
sweb.eval.x86_64.sympy_s_sympy-16988:latest
sweb.eval.x86_64.sympy_s_sympy-17022:latest
sweb.eval.x86_64.sympy_s_sympy-17139:latest
sweb.eval.x86_64.sympy_s_sympy-17630:latest
sweb.eval.x86_64.sympy_s_sympy-17655:latest
sweb.eval.x86_64.sympy_s_sympy-18057:latest
sweb.eval.x86_64.sympy_s_sympy-18087:latest
sweb.eval.x86_64.sympy_s_sympy-18189:latest
sweb.eval.x86_64.sympy_s_sympy-18199:latest
sweb.eval.x86_64.sympy_s_sympy-18532:latest
sweb.eval.x86_64.sympy_s_sympy-18621:latest
sweb.eval.x86_64.sympy_s_sympy-18698:latest
sweb.eval.x86_64.sympy_s_sympy-18835:latest
sweb.eval.x86_64.sympy_s_sympy-19007:latest
sweb.eval.x86_64.sympy_s_sympy-19254:latest
sweb.eval.x86_64.sympy_s_sympy-19487:latest
sweb.eval.x86_64.sympy_s_sympy-20049:latest
sweb.eval.x86_64.sympy_s_sympy-20154:latest
sweb.eval.x86_64.sympy_s_sympy-20212:latest
sweb.eval.x86_64.sympy_s_sympy-20322:latest
sweb.eval.x86_64.sympy_s_sympy-20442:latest
sweb.eval.x86_64.sympy_s_sympy-20590:latest
sweb.eval.x86_64.sympy_s_sympy-20639:latest
sweb.eval.x86_64.sympy_s_sympy-21055:latest
sweb.eval.x86_64.sympy_s_sympy-21171:latest
sweb.eval.x86_64.sympy_s_sympy-21379:latest
sweb.eval.x86_64.sympy_s_sympy-21612:latest
sweb.eval.x86_64.sympy_s_sympy-21614:latest
sweb.eval.x86_64.sympy_s_sympy-21627:latest
sweb.eval.x86_64.sympy_s_sympy-21847:latest
sweb.eval.x86_64.sympy_s_sympy-22005:latest
sweb.eval.x86_64.sympy_s_sympy-22714:latest
sweb.eval.x86_64.sympy_s_sympy-22840:latest
sweb.eval.x86_64.sympy_s_sympy-23117:latest
sweb.eval.x86_64.sympy_s_sympy-23191:latest
sweb.eval.x86_64.sympy_s_sympy-23262:latest
sweb.eval.x86_64.sympy_s_sympy-24066:latest
sweb.eval.x86_64.sympy_s_sympy-24102:latest
sweb.eval.x86_64.sympy_s_sympy-24152:latest
sweb.eval.x86_64.sympy_s_sympy-24213:latest
sweb.eval.x86_64.sympy_s_sympy-24909:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-12907:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14182:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14365:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14995:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-6938:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7746:latest
swebench/sweb.eval.x86_64.django_1776_django-10914:latest
swebench/sweb.eval.x86_64.django_1776_django-10924:latest
swebench/sweb.eval.x86_64.django_1776_django-11001:latest
swebench/sweb.eval.x86_64.django_1776_django-11019:latest
swebench/sweb.eval.x86_64.django_1776_django-11039:latest
swebench/sweb.eval.x86_64.django_1776_django-11049:latest
swebench/sweb.eval.x86_64.django_1776_django-11099:latest
swebench/sweb.eval.x86_64.django_1776_django-11133:latest
swebench/sweb.eval.x86_64.django_1776_django-11179:latest
swebench/sweb.eval.x86_64.django_1776_django-11283:latest
swebench/sweb.eval.x86_64.django_1776_django-11422:latest
swebench/sweb.eval.x86_64.django_1776_django-11564:latest
swebench/sweb.eval.x86_64.django_1776_django-11583:latest
swebench/sweb.eval.x86_64.django_1776_django-11620:latest
swebench/sweb.eval.x86_64.django_1776_django-11630:latest
swebench/sweb.eval.x86_64.django_1776_django-11742:latest
swebench/sweb.eval.x86_64.django_1776_django-11797:latest
swebench/sweb.eval.x86_64.django_1776_django-11815:latest
swebench/sweb.eval.x86_64.django_1776_django-11848:latest
swebench/sweb.eval.x86_64.django_1776_django-11905:latest
swebench/sweb.eval.x86_64.django_1776_django-11910:latest
swebench/sweb.eval.x86_64.django_1776_django-11964:latest
swebench/sweb.eval.x86_64.django_1776_django-11999:latest
swebench/sweb.eval.x86_64.django_1776_django-12113:latest
swebench/sweb.eval.x86_64.django_1776_django-12125:latest
swebench/sweb.eval.x86_64.django_1776_django-12184:latest
swebench/sweb.eval.x86_64.django_1776_django-12284:latest
swebench/sweb.eval.x86_64.django_1776_django-12286:latest
swebench/sweb.eval.x86_64.django_1776_django-12308:latest
swebench/sweb.eval.x86_64.django_1776_django-12453:latest
swebench/sweb.eval.x86_64.django_1776_django-12470:latest
swebench/sweb.eval.x86_64.django_1776_django-12497:latest
swebench/sweb.eval.x86_64.django_1776_django-12589:latest
swebench/sweb.eval.x86_64.django_1776_django-12700:latest
swebench/sweb.eval.x86_64.django_1776_django-12708:latest
swebench/sweb.eval.x86_64.django_1776_django-12747:latest
swebench/sweb.eval.x86_64.django_1776_django-12856:latest
swebench/sweb.eval.x86_64.django_1776_django-12908:latest
swebench/sweb.eval.x86_64.django_1776_django-12915:latest
swebench/sweb.eval.x86_64.django_1776_django-12983:latest
swebench/sweb.eval.x86_64.django_1776_django-13028:latest
swebench/sweb.eval.x86_64.django_1776_django-13033:latest
swebench/sweb.eval.x86_64.django_1776_django-13158:latest
swebench/sweb.eval.x86_64.django_1776_django-13220:latest
swebench/sweb.eval.x86_64.django_1776_django-13230:latest
swebench/sweb.eval.x86_64.django_1776_django-13265:latest
swebench/sweb.eval.x86_64.django_1776_django-13315:latest
swebench/sweb.eval.x86_64.django_1776_django-13321:latest
swebench/sweb.eval.x86_64.django_1776_django-13401:latest
swebench/sweb.eval.x86_64.django_1776_django-13447:latest
swebench/sweb.eval.x86_64.django_1776_django-13448:latest
swebench/sweb.eval.x86_64.django_1776_django-13551:latest
swebench/sweb.eval.x86_64.django_1776_django-13590:latest
swebench/sweb.eval.x86_64.django_1776_django-13658:latest
swebench/sweb.eval.x86_64.django_1776_django-13660:latest
swebench/sweb.eval.x86_64.django_1776_django-13710:latest
swebench/sweb.eval.x86_64.django_1776_django-13757:latest
swebench/sweb.eval.x86_64.django_1776_django-13768:latest
swebench/sweb.eval.x86_64.django_1776_django-13925:latest
swebench/sweb.eval.x86_64.django_1776_django-13933:latest
swebench/sweb.eval.x86_64.django_1776_django-13964:latest
swebench/sweb.eval.x86_64.django_1776_django-14016:latest
swebench/sweb.eval.x86_64.django_1776_django-14017:latest
swebench/sweb.eval.x86_64.django_1776_django-14155:latest
swebench/sweb.eval.x86_64.django_1776_django-14238:latest
swebench/sweb.eval.x86_64.django_1776_django-14382:latest
swebench/sweb.eval.x86_64.django_1776_django-14411:latest
swebench/sweb.eval.x86_64.django_1776_django-14534:latest
swebench/sweb.eval.x86_64.django_1776_django-14580:latest
swebench/sweb.eval.x86_64.django_1776_django-14608:latest
swebench/sweb.eval.x86_64.django_1776_django-14667:latest
swebench/sweb.eval.x86_64.django_1776_django-14672:latest
swebench/sweb.eval.x86_64.django_1776_django-14730:latest
swebench/sweb.eval.x86_64.django_1776_django-14752:latest
swebench/sweb.eval.x86_64.django_1776_django-14787:latest
swebench/sweb.eval.x86_64.django_1776_django-14855:latest
swebench/sweb.eval.x86_64.django_1776_django-14915:latest
swebench/sweb.eval.x86_64.django_1776_django-14997:latest
swebench/sweb.eval.x86_64.django_1776_django-14999:latest
swebench/sweb.eval.x86_64.django_1776_django-15061:latest
swebench/sweb.eval.x86_64.django_1776_django-15202:latest
swebench/sweb.eval.x86_64.django_1776_django-15213:latest
swebench/sweb.eval.x86_64.django_1776_django-15252:latest
swebench/sweb.eval.x86_64.django_1776_django-15320:latest
swebench/sweb.eval.x86_64.django_1776_django-15347:latest
swebench/sweb.eval.x86_64.django_1776_django-15388:latest
swebench/sweb.eval.x86_64.django_1776_django-15400:latest
swebench/sweb.eval.x86_64.django_1776_django-15498:latest
swebench/sweb.eval.x86_64.django_1776_django-15695:latest
swebench/sweb.eval.x86_64.django_1776_django-15738:latest
swebench/sweb.eval.x86_64.django_1776_django-15781:latest
swebench/sweb.eval.x86_64.django_1776_django-15789:latest
swebench/sweb.eval.x86_64.django_1776_django-15790:latest
swebench/sweb.eval.x86_64.django_1776_django-15814:latest
swebench/sweb.eval.x86_64.django_1776_django-15819:latest
swebench/sweb.eval.x86_64.django_1776_django-15851:latest
swebench/sweb.eval.x86_64.django_1776_django-15902:latest
swebench/sweb.eval.x86_64.django_1776_django-15996:latest
swebench/sweb.eval.x86_64.django_1776_django-16041:latest
swebench/sweb.eval.x86_64.django_1776_django-16046:latest
swebench/sweb.eval.x86_64.django_1776_django-16139:latest
swebench/sweb.eval.x86_64.django_1776_django-16229:latest
swebench/sweb.eval.x86_64.django_1776_django-16255:latest
swebench/sweb.eval.x86_64.django_1776_django-16379:latest
swebench/sweb.eval.x86_64.django_1776_django-16400:latest
swebench/sweb.eval.x86_64.django_1776_django-16408:latest
swebench/sweb.eval.x86_64.django_1776_django-16527:latest
swebench/sweb.eval.x86_64.django_1776_django-16595:latest
swebench/sweb.eval.x86_64.django_1776_django-16816:latest
swebench/sweb.eval.x86_64.django_1776_django-16820:latest
swebench/sweb.eval.x86_64.django_1776_django-16873:latest
swebench/sweb.eval.x86_64.django_1776_django-16910:latest
swebench/sweb.eval.x86_64.django_1776_django-17051:latest
swebench/sweb.eval.x86_64.django_1776_django-17087:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-18869:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22711:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22835:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23299:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23314:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23476:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23562:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23563:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23913:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23964:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23987:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24149:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24265:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24334:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24970:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25079:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25311:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25332:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25433:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25442:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25498:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26011:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26020:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-2848:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3010:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3190:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3407:latest
swebench/sweb.eval.x86_64.pallets_1776_flask-4045:latest
swebench/sweb.eval.x86_64.pallets_1776_flask-4992:latest
swebench/sweb.eval.x86_64.pallets_1776_flask-5063:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1963:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2148:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2317:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2674:latest
swebench/sweb.eval.x86_64.psf_1776_requests-3362:latest
swebench/sweb.eval.x86_64.psf_1776_requests-863:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3364:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4094:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4248:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4493:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-5131:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-5859:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6506:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7080:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7114:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7228:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7993:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-11143:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-11148:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5103:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5221:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5227:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5413:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5495:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5692:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-6116:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7168:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7220:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7373:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7432:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7490:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-8365:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-8906:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-9359:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10297:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10508:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10949:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11040:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11281:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12471:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13142:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13241:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13439:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13496:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13497:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13584:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13779:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14087:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14092:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14894:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14983:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-15512:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-15535:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25500:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25570:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25638:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25747:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10325:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10451:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-11445:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7686:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7738:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7975:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8273:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8282:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8435:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8474:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8506:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8595:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8627:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8713:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8721:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8801:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-11400:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-11870:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-11897:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12171:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12236:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12419:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12454:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12481:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13031:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13043:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13146:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13177:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13437:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13471:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13480:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13647:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13773:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13895:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13915:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13971:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14024:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14308:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14317:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14396:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14774:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14817:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15011:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15308:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15345:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15346:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15609:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15678:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16106:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16281:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16503:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16792:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16988:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17022:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17139:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17630:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17655:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18057:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18087:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18199:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18532:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18621:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18698:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18835:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19007:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19254:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19487:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20049:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20154:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20212:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20322:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20442:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20590:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20639:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21055:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21171:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21379:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21612:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21614:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21627:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21847:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22005:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22714:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22840:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23117:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23191:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23262:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24066:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24102:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24152:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24213:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24909:latest

View File

@ -0,0 +1,500 @@
swebench/sweb.eval.x86_64.astropy_1776_astropy-12907:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13033:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13236:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13398:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13453:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13579:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-13977:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14096:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14182:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14309:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14365:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14369:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14508:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14539:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14598:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-14995:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7166:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7336:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7606:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-7671:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-8707:latest
swebench/sweb.eval.x86_64.astropy_1776_astropy-8872:latest
swebench/sweb.eval.x86_64.django_1776_django-10097:latest
swebench/sweb.eval.x86_64.django_1776_django-10554:latest
swebench/sweb.eval.x86_64.django_1776_django-10880:latest
swebench/sweb.eval.x86_64.django_1776_django-10914:latest
swebench/sweb.eval.x86_64.django_1776_django-10973:latest
swebench/sweb.eval.x86_64.django_1776_django-10999:latest
swebench/sweb.eval.x86_64.django_1776_django-11066:latest
swebench/sweb.eval.x86_64.django_1776_django-11087:latest
swebench/sweb.eval.x86_64.django_1776_django-11095:latest
swebench/sweb.eval.x86_64.django_1776_django-11099:latest
swebench/sweb.eval.x86_64.django_1776_django-11119:latest
swebench/sweb.eval.x86_64.django_1776_django-11133:latest
swebench/sweb.eval.x86_64.django_1776_django-11138:latest
swebench/sweb.eval.x86_64.django_1776_django-11141:latest
swebench/sweb.eval.x86_64.django_1776_django-11149:latest
swebench/sweb.eval.x86_64.django_1776_django-11163:latest
swebench/sweb.eval.x86_64.django_1776_django-11179:latest
swebench/sweb.eval.x86_64.django_1776_django-11206:latest
swebench/sweb.eval.x86_64.django_1776_django-11211:latest
swebench/sweb.eval.x86_64.django_1776_django-11239:latest
swebench/sweb.eval.x86_64.django_1776_django-11265:latest
swebench/sweb.eval.x86_64.django_1776_django-11276:latest
swebench/sweb.eval.x86_64.django_1776_django-11292:latest
swebench/sweb.eval.x86_64.django_1776_django-11299:latest
swebench/sweb.eval.x86_64.django_1776_django-11333:latest
swebench/sweb.eval.x86_64.django_1776_django-11400:latest
swebench/sweb.eval.x86_64.django_1776_django-11433:latest
swebench/sweb.eval.x86_64.django_1776_django-11451:latest
swebench/sweb.eval.x86_64.django_1776_django-11477:latest
swebench/sweb.eval.x86_64.django_1776_django-11490:latest
swebench/sweb.eval.x86_64.django_1776_django-11532:latest
swebench/sweb.eval.x86_64.django_1776_django-11551:latest
swebench/sweb.eval.x86_64.django_1776_django-11555:latest
swebench/sweb.eval.x86_64.django_1776_django-11603:latest
swebench/sweb.eval.x86_64.django_1776_django-11728:latest
swebench/sweb.eval.x86_64.django_1776_django-11734:latest
swebench/sweb.eval.x86_64.django_1776_django-11740:latest
swebench/sweb.eval.x86_64.django_1776_django-11749:latest
swebench/sweb.eval.x86_64.django_1776_django-11790:latest
swebench/sweb.eval.x86_64.django_1776_django-11815:latest
swebench/sweb.eval.x86_64.django_1776_django-11820:latest
swebench/sweb.eval.x86_64.django_1776_django-11848:latest
swebench/sweb.eval.x86_64.django_1776_django-11880:latest
swebench/sweb.eval.x86_64.django_1776_django-11885:latest
swebench/sweb.eval.x86_64.django_1776_django-11951:latest
swebench/sweb.eval.x86_64.django_1776_django-11964:latest
swebench/sweb.eval.x86_64.django_1776_django-11999:latest
swebench/sweb.eval.x86_64.django_1776_django-12039:latest
swebench/sweb.eval.x86_64.django_1776_django-12050:latest
swebench/sweb.eval.x86_64.django_1776_django-12125:latest
swebench/sweb.eval.x86_64.django_1776_django-12143:latest
swebench/sweb.eval.x86_64.django_1776_django-12155:latest
swebench/sweb.eval.x86_64.django_1776_django-12193:latest
swebench/sweb.eval.x86_64.django_1776_django-12209:latest
swebench/sweb.eval.x86_64.django_1776_django-12262:latest
swebench/sweb.eval.x86_64.django_1776_django-12273:latest
swebench/sweb.eval.x86_64.django_1776_django-12276:latest
swebench/sweb.eval.x86_64.django_1776_django-12304:latest
swebench/sweb.eval.x86_64.django_1776_django-12308:latest
swebench/sweb.eval.x86_64.django_1776_django-12325:latest
swebench/sweb.eval.x86_64.django_1776_django-12406:latest
swebench/sweb.eval.x86_64.django_1776_django-12419:latest
swebench/sweb.eval.x86_64.django_1776_django-12663:latest
swebench/sweb.eval.x86_64.django_1776_django-12708:latest
swebench/sweb.eval.x86_64.django_1776_django-12713:latest
swebench/sweb.eval.x86_64.django_1776_django-12741:latest
swebench/sweb.eval.x86_64.django_1776_django-12754:latest
swebench/sweb.eval.x86_64.django_1776_django-12774:latest
swebench/sweb.eval.x86_64.django_1776_django-12858:latest
swebench/sweb.eval.x86_64.django_1776_django-12965:latest
swebench/sweb.eval.x86_64.django_1776_django-13012:latest
swebench/sweb.eval.x86_64.django_1776_django-13023:latest
swebench/sweb.eval.x86_64.django_1776_django-13028:latest
swebench/sweb.eval.x86_64.django_1776_django-13033:latest
swebench/sweb.eval.x86_64.django_1776_django-13089:latest
swebench/sweb.eval.x86_64.django_1776_django-13109:latest
swebench/sweb.eval.x86_64.django_1776_django-13112:latest
swebench/sweb.eval.x86_64.django_1776_django-13121:latest
swebench/sweb.eval.x86_64.django_1776_django-13128:latest
swebench/sweb.eval.x86_64.django_1776_django-13158:latest
swebench/sweb.eval.x86_64.django_1776_django-13195:latest
swebench/sweb.eval.x86_64.django_1776_django-13212:latest
swebench/sweb.eval.x86_64.django_1776_django-13279:latest
swebench/sweb.eval.x86_64.django_1776_django-13297:latest
swebench/sweb.eval.x86_64.django_1776_django-13315:latest
swebench/sweb.eval.x86_64.django_1776_django-13343:latest
swebench/sweb.eval.x86_64.django_1776_django-13344:latest
swebench/sweb.eval.x86_64.django_1776_django-13346:latest
swebench/sweb.eval.x86_64.django_1776_django-13363:latest
swebench/sweb.eval.x86_64.django_1776_django-13401:latest
swebench/sweb.eval.x86_64.django_1776_django-13406:latest
swebench/sweb.eval.x86_64.django_1776_django-13410:latest
swebench/sweb.eval.x86_64.django_1776_django-13417:latest
swebench/sweb.eval.x86_64.django_1776_django-13449:latest
swebench/sweb.eval.x86_64.django_1776_django-13512:latest
swebench/sweb.eval.x86_64.django_1776_django-13513:latest
swebench/sweb.eval.x86_64.django_1776_django-13516:latest
swebench/sweb.eval.x86_64.django_1776_django-13551:latest
swebench/sweb.eval.x86_64.django_1776_django-13568:latest
swebench/sweb.eval.x86_64.django_1776_django-13569:latest
swebench/sweb.eval.x86_64.django_1776_django-13590:latest
swebench/sweb.eval.x86_64.django_1776_django-13658:latest
swebench/sweb.eval.x86_64.django_1776_django-13670:latest
swebench/sweb.eval.x86_64.django_1776_django-13741:latest
swebench/sweb.eval.x86_64.django_1776_django-13786:latest
swebench/sweb.eval.x86_64.django_1776_django-13794:latest
swebench/sweb.eval.x86_64.django_1776_django-13807:latest
swebench/sweb.eval.x86_64.django_1776_django-13809:latest
swebench/sweb.eval.x86_64.django_1776_django-13810:latest
swebench/sweb.eval.x86_64.django_1776_django-13820:latest
swebench/sweb.eval.x86_64.django_1776_django-13821:latest
swebench/sweb.eval.x86_64.django_1776_django-13837:latest
swebench/sweb.eval.x86_64.django_1776_django-13925:latest
swebench/sweb.eval.x86_64.django_1776_django-13933:latest
swebench/sweb.eval.x86_64.django_1776_django-13964:latest
swebench/sweb.eval.x86_64.django_1776_django-14007:latest
swebench/sweb.eval.x86_64.django_1776_django-14011:latest
swebench/sweb.eval.x86_64.django_1776_django-14017:latest
swebench/sweb.eval.x86_64.django_1776_django-14034:latest
swebench/sweb.eval.x86_64.django_1776_django-14053:latest
swebench/sweb.eval.x86_64.django_1776_django-14089:latest
swebench/sweb.eval.x86_64.django_1776_django-14122:latest
swebench/sweb.eval.x86_64.django_1776_django-14140:latest
swebench/sweb.eval.x86_64.django_1776_django-14155:latest
swebench/sweb.eval.x86_64.django_1776_django-14170:latest
swebench/sweb.eval.x86_64.django_1776_django-14238:latest
swebench/sweb.eval.x86_64.django_1776_django-14311:latest
swebench/sweb.eval.x86_64.django_1776_django-14315:latest
swebench/sweb.eval.x86_64.django_1776_django-14349:latest
swebench/sweb.eval.x86_64.django_1776_django-14351:latest
swebench/sweb.eval.x86_64.django_1776_django-14373:latest
swebench/sweb.eval.x86_64.django_1776_django-14376:latest
swebench/sweb.eval.x86_64.django_1776_django-14404:latest
swebench/sweb.eval.x86_64.django_1776_django-14434:latest
swebench/sweb.eval.x86_64.django_1776_django-14493:latest
swebench/sweb.eval.x86_64.django_1776_django-14500:latest
swebench/sweb.eval.x86_64.django_1776_django-14534:latest
swebench/sweb.eval.x86_64.django_1776_django-14539:latest
swebench/sweb.eval.x86_64.django_1776_django-14559:latest
swebench/sweb.eval.x86_64.django_1776_django-14580:latest
swebench/sweb.eval.x86_64.django_1776_django-14608:latest
swebench/sweb.eval.x86_64.django_1776_django-14631:latest
swebench/sweb.eval.x86_64.django_1776_django-14672:latest
swebench/sweb.eval.x86_64.django_1776_django-14725:latest
swebench/sweb.eval.x86_64.django_1776_django-14752:latest
swebench/sweb.eval.x86_64.django_1776_django-14765:latest
swebench/sweb.eval.x86_64.django_1776_django-14771:latest
swebench/sweb.eval.x86_64.django_1776_django-14787:latest
swebench/sweb.eval.x86_64.django_1776_django-14792:latest
swebench/sweb.eval.x86_64.django_1776_django-14855:latest
swebench/sweb.eval.x86_64.django_1776_django-14915:latest
swebench/sweb.eval.x86_64.django_1776_django-14999:latest
swebench/sweb.eval.x86_64.django_1776_django-15022:latest
swebench/sweb.eval.x86_64.django_1776_django-15037:latest
swebench/sweb.eval.x86_64.django_1776_django-15098:latest
swebench/sweb.eval.x86_64.django_1776_django-15103:latest
swebench/sweb.eval.x86_64.django_1776_django-15104:latest
swebench/sweb.eval.x86_64.django_1776_django-15127:latest
swebench/sweb.eval.x86_64.django_1776_django-15128:latest
swebench/sweb.eval.x86_64.django_1776_django-15161:latest
swebench/sweb.eval.x86_64.django_1776_django-15252:latest
swebench/sweb.eval.x86_64.django_1776_django-15268:latest
swebench/sweb.eval.x86_64.django_1776_django-15277:latest
swebench/sweb.eval.x86_64.django_1776_django-15278:latest
swebench/sweb.eval.x86_64.django_1776_django-15280:latest
swebench/sweb.eval.x86_64.django_1776_django-15315:latest
swebench/sweb.eval.x86_64.django_1776_django-15368:latest
swebench/sweb.eval.x86_64.django_1776_django-15375:latest
swebench/sweb.eval.x86_64.django_1776_django-15380:latest
swebench/sweb.eval.x86_64.django_1776_django-15382:latest
swebench/sweb.eval.x86_64.django_1776_django-15467:latest
swebench/sweb.eval.x86_64.django_1776_django-15499:latest
swebench/sweb.eval.x86_64.django_1776_django-15503:latest
swebench/sweb.eval.x86_64.django_1776_django-15525:latest
swebench/sweb.eval.x86_64.django_1776_django-15554:latest
swebench/sweb.eval.x86_64.django_1776_django-15561:latest
swebench/sweb.eval.x86_64.django_1776_django-15563:latest
swebench/sweb.eval.x86_64.django_1776_django-15569:latest
swebench/sweb.eval.x86_64.django_1776_django-15572:latest
swebench/sweb.eval.x86_64.django_1776_django-15629:latest
swebench/sweb.eval.x86_64.django_1776_django-15695:latest
swebench/sweb.eval.x86_64.django_1776_django-15731:latest
swebench/sweb.eval.x86_64.django_1776_django-15732:latest
swebench/sweb.eval.x86_64.django_1776_django-15741:latest
swebench/sweb.eval.x86_64.django_1776_django-15814:latest
swebench/sweb.eval.x86_64.django_1776_django-15851:latest
swebench/sweb.eval.x86_64.django_1776_django-15863:latest
swebench/sweb.eval.x86_64.django_1776_django-15916:latest
swebench/sweb.eval.x86_64.django_1776_django-15930:latest
swebench/sweb.eval.x86_64.django_1776_django-15957:latest
swebench/sweb.eval.x86_64.django_1776_django-15973:latest
swebench/sweb.eval.x86_64.django_1776_django-15987:latest
swebench/sweb.eval.x86_64.django_1776_django-16032:latest
swebench/sweb.eval.x86_64.django_1776_django-16082:latest
swebench/sweb.eval.x86_64.django_1776_django-16100:latest
swebench/sweb.eval.x86_64.django_1776_django-16116:latest
swebench/sweb.eval.x86_64.django_1776_django-16136:latest
swebench/sweb.eval.x86_64.django_1776_django-16139:latest
swebench/sweb.eval.x86_64.django_1776_django-16145:latest
swebench/sweb.eval.x86_64.django_1776_django-16255:latest
swebench/sweb.eval.x86_64.django_1776_django-16256:latest
swebench/sweb.eval.x86_64.django_1776_django-16263:latest
swebench/sweb.eval.x86_64.django_1776_django-16315:latest
swebench/sweb.eval.x86_64.django_1776_django-16333:latest
swebench/sweb.eval.x86_64.django_1776_django-16429:latest
swebench/sweb.eval.x86_64.django_1776_django-16454:latest
swebench/sweb.eval.x86_64.django_1776_django-16485:latest
swebench/sweb.eval.x86_64.django_1776_django-16493:latest
swebench/sweb.eval.x86_64.django_1776_django-16502:latest
swebench/sweb.eval.x86_64.django_1776_django-16527:latest
swebench/sweb.eval.x86_64.django_1776_django-16560:latest
swebench/sweb.eval.x86_64.django_1776_django-16569:latest
swebench/sweb.eval.x86_64.django_1776_django-16595:latest
swebench/sweb.eval.x86_64.django_1776_django-16612:latest
swebench/sweb.eval.x86_64.django_1776_django-16631:latest
swebench/sweb.eval.x86_64.django_1776_django-16642:latest
swebench/sweb.eval.x86_64.django_1776_django-16661:latest
swebench/sweb.eval.x86_64.django_1776_django-16662:latest
swebench/sweb.eval.x86_64.django_1776_django-16667:latest
swebench/sweb.eval.x86_64.django_1776_django-16801:latest
swebench/sweb.eval.x86_64.django_1776_django-16819:latest
swebench/sweb.eval.x86_64.django_1776_django-16877:latest
swebench/sweb.eval.x86_64.django_1776_django-16899:latest
swebench/sweb.eval.x86_64.django_1776_django-16901:latest
swebench/sweb.eval.x86_64.django_1776_django-16938:latest
swebench/sweb.eval.x86_64.django_1776_django-16950:latest
swebench/sweb.eval.x86_64.django_1776_django-17029:latest
swebench/sweb.eval.x86_64.django_1776_django-17084:latest
swebench/sweb.eval.x86_64.django_1776_django-17087:latest
swebench/sweb.eval.x86_64.django_1776_django-7530:latest
swebench/sweb.eval.x86_64.django_1776_django-9296:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-13989:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-14623:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20488:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20676:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20826:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-20859:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-21568:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22719:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22865:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-22871:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23299:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23314:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23412:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-23476:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24026:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24149:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24177:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24570:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24627:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24637:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24870:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-24970:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25122:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25287:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25311:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25332:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25479:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25775:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-25960:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26113:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26208:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26291:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26342:latest
swebench/sweb.eval.x86_64.matplotlib_1776_matplotlib-26466:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3069:latest
swebench/sweb.eval.x86_64.mwaskom_1776_seaborn-3187:latest
swebench/sweb.eval.x86_64.pallets_1776_flask-5014:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1142:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1724:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1766:latest
swebench/sweb.eval.x86_64.psf_1776_requests-1921:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2317:latest
swebench/sweb.eval.x86_64.psf_1776_requests-2931:latest
swebench/sweb.eval.x86_64.psf_1776_requests-5414:latest
swebench/sweb.eval.x86_64.psf_1776_requests-6028:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-2905:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3095:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3151:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3305:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3677:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-3993:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4075:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4094:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4356:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4629:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4687:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4695:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-4966:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6461:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6599:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6721:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6744:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6938:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-6992:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-7229:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-7233:latest
swebench/sweb.eval.x86_64.pydata_1776_xarray-7393:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4551:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4604:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4661:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-4970:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6386:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6528:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-6903:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7080:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-7277:latest
swebench/sweb.eval.x86_64.pylint-dev_1776_pylint-8898:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-10051:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-10081:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-10356:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5262:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5631:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5787:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5809:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-5840:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-6197:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-6202:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7205:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7236:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7324:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7432:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7490:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7521:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7571:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-7982:latest
swebench/sweb.eval.x86_64.pytest-dev_1776_pytest-8399:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10297:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10844:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-10908:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11310:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-11578:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12585:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12682:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-12973:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13124:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13135:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13142:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13328:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13439:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13496:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-13779:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14053:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14087:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14141:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14496:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14629:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14710:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14894:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-14983:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-15100:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25102:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25232:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25747:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25931:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-25973:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-26194:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-26323:latest
swebench/sweb.eval.x86_64.scikit-learn_1776_scikit-learn-9288:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10323:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10435:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10449:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10466:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10614:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-10673:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-11445:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-11510:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7440:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7454:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7462:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7590:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7748:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7757:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7889:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7910:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-7985:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8035:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8056:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8120:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8265:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8269:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8459:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8475:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8548:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8551:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8593:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8595:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8621:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8638:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-8721:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9229:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9230:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9258:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9281:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9320:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9367:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9461:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9591:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9602:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9658:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9673:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9698:latest
swebench/sweb.eval.x86_64.sphinx-doc_1776_sphinx-9711:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-11618:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12096:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12419:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12481:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-12489:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13031:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13091:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13372:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13480:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13551:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13615:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13647:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13757:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13798:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13852:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13877:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13878:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-13974:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14248:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14531:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14711:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-14976:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15017:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15345:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15349:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15599:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15809:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15875:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-15976:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16450:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16597:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16766:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16792:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-16886:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17139:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17318:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17630:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-17655:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18189:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18199:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18211:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18698:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-18763:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19040:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19346:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19495:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19637:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19783:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-19954:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20154:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20428:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20438:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20590:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20801:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-20916:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21379:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21596:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21612:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21847:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-21930:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22080:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22456:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22714:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-22914:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23262:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23413:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23534:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23824:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-23950:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24066:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24213:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24443:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24539:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24562:latest
swebench/sweb.eval.x86_64.sympy_1776_sympy-24661:latest

View File

@ -0,0 +1,50 @@
"""Get official docker image names for SWE-bench instances."""
import argparse
from datasets import load_dataset
# Command-line interface: which dataset/split to read the instance ids from
# and which file receives the resulting list of image names.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='princeton-nlp/SWE-bench')
parser.add_argument('--split', type=str, default='test')
parser.add_argument('--output', type=str, default='swebench_images.txt')
args = parser.parse_args()

# Datasets this script accepts for --dataset.
SUPPORTED_DATASET = {
    'princeton-nlp/SWE-bench_Multimodal',
    'princeton-nlp/SWE-bench',
    'princeton-nlp/SWE-bench_Lite',
    'princeton-nlp/SWE-bench_Verified',
}

# Validate explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let an unsupported dataset slip through.
# parser.error prints the usage line plus this message and exits with status 2.
if args.dataset not in SUPPORTED_DATASET:
    parser.error(f'Dataset {args.dataset} not supported')
def swebench_instance_id_to_docker_image_name(instance_id: str) -> str:
    """Build the instance-level docker image name for a SWE-bench instance id.

    The id is split on '__' and must yield exactly two parts (repo and name);
    any other shape raises ValueError from the tuple unpacking.

    Example:
        'django__django-11333'
        -> 'swebench/sweb.eval.x86_64.django_1776_django-11333:latest'
    """
    repo, name = instance_id.split('__')
    # The '__' separator of the instance id becomes '_1776_' in the image name.
    image_stem = '_1776_'.join((repo, name))
    return 'swebench/sweb.eval.x86_64.' + image_stem + ':latest'
def swebench_multimodal_instance_id_to_docker_image_name(instance_id: str) -> str:
    """Build the docker image name for a SWE-bench Multimodal instance id.

    Same scheme as the non-multimodal variant, but under the 'sweb.mm.eval'
    prefix. The id must split on '__' into exactly two parts, otherwise the
    tuple unpacking raises ValueError.

    Example:
        'openlayers__openlayers-12172'
        -> 'swebench/sweb.mm.eval.x86_64.openlayers_1776_openlayers-12172:latest'
    """
    repo, name = instance_id.split('__')
    return 'swebench/sweb.mm.eval.x86_64.{}_1776_{}:latest'.format(repo, name)
# Load the requested split and keep only the instance ids.
dataset = load_dataset(args.dataset, split=args.split)
instance_ids = dataset['instance_id']
print(f'Loading {len(instance_ids)} instances from {args.dataset} split {args.split}')

# The naming scheme depends only on the dataset, so choose the converter once:
# the three text-only datasets share one scheme, anything else (i.e. the
# multimodal dataset) uses the 'sweb.mm.eval' scheme.
if args.dataset in [
    'princeton-nlp/SWE-bench',
    'princeton-nlp/SWE-bench_Lite',
    'princeton-nlp/SWE-bench_Verified',
]:
    to_image_name = swebench_instance_id_to_docker_image_name
else:
    to_image_name = swebench_multimodal_instance_id_to_docker_image_name

# Write one image name per line, in dataset order.
with open(args.output, 'w') as f:
    for instance_id in instance_ids:
        f.write(to_image_name(instance_id) + '\n')

print(f'Saved {len(instance_ids)} images to {args.output}')

View File

@ -1,66 +1,36 @@
#!/usr/bin/env bash
set -e
LEVEL=$1
# three levels:
# - base, keyword "sweb.base"
# - env, keyword "sweb.env"
# - instance, keyword "sweb.eval"
SET=$2
if [ -z "$LEVEL" ]; then
echo "Usage: $0 <cache_level> <set>"
echo "cache_level: base, env, or instance"
echo "set: lite, full"
SET=$1
# check set is in ["full", "lite", "verified"]
if [ "$SET" != "full" ] && [ "$SET" != "lite" ] && [ "$SET" != "verified" ]; then
echo "Error: argument 1 must be one of: full, lite, verified"
exit 1
fi
if [ -z "$SET" ]; then
echo "Usage: $0 <cache_level> <set>"
echo "cache_level: base, env, or instance"
echo "set: lite, full, default is lite"
SET="lite"
input_file=evaluation/benchmarks/swe_bench/scripts/docker/all-swebench-${SET}-instance-images.txt
echo "Downloading images based on ${input_file}"
# Check if the file exists
if [ ! -f "$input_file" ]; then
echo "Error: File '$input_file' not found"
exit 1
fi
# Check if namespace is provided via argument $3, otherwise default to 'xingyaoww'
NAMESPACE=${3:-xingyaoww}
# Get total number of images
total_images=$(wc -l < "${input_file}")
counter=0
echo "Using namespace: $NAMESPACE"
echo "Starting to pull ${total_images} images"
if [ "$SET" == "lite" ]; then
IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"
else
IMAGE_FILE="$(dirname "$0")/all-swebench-full-instance-images.txt"
fi
# Read the file line by line and pull each image
while IFS= read -r image; do
# Skip empty lines or comments
if [ -n "$image" ] && [[ ! "$image" =~ ^[[:space:]]*# ]]; then
counter=$((counter + 1))
echo "[${counter}/${total_images}] Pulling ${image}"
docker pull "${image}"
sleep 2
fi
done < "${input_file}"
# Define a pattern based on the level
case $LEVEL in
base)
PATTERN="sweb.base"
;;
env)
PATTERN="sweb.base\|sweb.env"
;;
instance)
PATTERN="sweb.base\|sweb.env\|sweb.eval"
;;
*)
echo "Invalid cache level: $LEVEL"
echo "Valid levels are: base, env, instance"
exit 1
;;
esac
echo "Pulling docker images for [$LEVEL] level"
echo "Pattern: $PATTERN"
echo "Image file: $IMAGE_FILE"
# Read each line from the file, filter by pattern, and pull the docker image
grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
echo "Pulling $NAMESPACE/$image into $image"
docker pull $NAMESPACE/$image
# replace _s_ to __ in the image name
renamed_image=$(echo "$image" | sed 's/_s_/__/g')
docker tag $NAMESPACE/$image $renamed_image
done
echo "Finished pulling all images"

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import os
import subprocess
import pandas as pd
from termcolor import colored
@ -35,6 +36,23 @@ if args.only_x_instances:
f'After removing instances not in X={args.input_file_1}: Y={df2.shape[0]} instances'
)
# Add summarization step for each input file
def summarize_file(file_path):
    """Print a summary of one output file by running summarize_outputs.py on it.

    The helper script is looked up in the same directory as this file and is
    run as a child process whose output goes to this process's stdout/stderr,
    framed by '=' separator lines.

    Args:
        file_path: Path handed to summarize_outputs.py as its only argument.

    Raises:
        subprocess.CalledProcessError: If the helper script exits non-zero.
    """
    # Local import: only needed here to locate the running interpreter.
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    summarize_script = os.path.join(script_dir, 'summarize_outputs.py')

    print(f'\nSummary for {file_path}:')
    # Flush before spawning the child so the header is not emitted after the
    # child's output when stdout is block-buffered (e.g. piped to a file).
    print('=' * 80, flush=True)
    # Run the helper with the interpreter executing this script rather than
    # whatever 'python' resolves to on PATH (may be absent or another env).
    subprocess.run([sys.executable, summarize_script, file_path], check=True)
    print('=' * 80)
# Generate summaries
summarize_file(args.input_file_1)
summarize_file(args.input_file_2)
# Get the intersection of the instance_ids
df = pd.merge(df1, df2, on='instance_id', how='inner')

View File

@ -248,6 +248,22 @@ def write_row_to_md_file(row, instance_id_to_test_result):
completions = load_completions(instance_id)
# report file
global output_dir
report_file = os.path.join(output_dir, 'eval_outputs', instance_id, 'report.json')
if os.path.exists(report_file):
with open(report_file, 'r') as f:
report = json.load(f)
else:
report = None
test_output_file = os.path.join(
output_dir, 'eval_outputs', instance_id, 'test_output.txt'
)
if test_output is None and os.path.exists(test_output_file):
with open(test_output_file, 'r') as f:
test_output = f.read()
with open(filepath, 'w') as f:
f.write(f'# {instance_id} (resolved: {resolved})\n')
@ -269,8 +285,14 @@ def write_row_to_md_file(row, instance_id_to_test_result):
f.write('## Model Patch\n')
f.write(f'{process_git_patch(model_patch)}\n')
if report is not None:
f.write('## Report\n')
f.write(json.dumps(report, indent=2))
f.write('\n')
f.write('## Test Output\n')
f.write(str(test_output))
f.write('\n')
instance_id_to_test_result = {}

View File

@ -44,7 +44,6 @@ if os.path.exists(swebench_official_report_json):
f"- resolved instances: {report['resolved_instances']}\n"
f"- unresolved instances: {report['unresolved_instances']}\n"
f"- error instances: {report['error_instances']}\n"
f"- unstopped instances: {report['unstopped_instances']}\n"
)
output_md += '\n## Resolved Instances\n'

View File

@ -247,11 +247,21 @@ def prepare_dataset(
f'Starting evaluation with skipping first {skip_num} instances ({len(dataset)} instances to run).'
)
if eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit)
logger.info(f'Limiting evaluation to {eval_n_limit} instances.')
# Use fixed random seed 42 for sampling without replacement
dataset = dataset.sample(
min(eval_n_limit, len(dataset)), random_state=42, replace=False
)
logger.info(
f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
)
elif eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit)
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
# Use fixed random seed 42 for sampling without replacement
dataset = dataset.sample(
min(eval_n_limit, len(dataset)), random_state=42, replace=False
)
logger.info(
f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
)
new_dataset = [
instance

21
poetry.lock generated
View File

@ -9106,13 +9106,15 @@ files = [
[[package]]
name = "swebench"
version = "2.0.13"
version = "3.0.8"
description = "The official SWE-bench package - a benchmark for evaluating LMs on software engineering"
optional = false
python-versions = ">=3.8"
groups = ["evaluation"]
files = []
develop = false
files = [
{file = "swebench-3.0.8-py3-none-any.whl", hash = "sha256:daea564215dc77fc27998405a68e7b40880d25ed408813fe0ccd890bcc249a02"},
{file = "swebench-3.0.8.tar.gz", hash = "sha256:f86f8412690c808592b3accb20c018f9cf480dbafa21525e065a138dd06b6e1f"},
]
[package.dependencies]
beautifulsoup4 = "*"
@ -9121,21 +9123,18 @@ datasets = "*"
docker = "*"
ghapi = "*"
GitPython = "*"
modal = "*"
pre-commit = "*"
python-dotenv = "*"
requests = "*"
rich = "*"
tenacity = "*"
tqdm = "*"
unidiff = "*"
[package.extras]
inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tenacity", "tiktoken", "torch", "transformers", "triton"]
[package.source]
type = "git"
url = "https://github.com/All-Hands-AI/SWE-bench.git"
reference = "HEAD"
resolved_reference = "c807c112edc3dcb4fdf5ddac63b34706912d5cdb"
inference = ["anthropic", "flash_attn", "jedi", "openai", "peft", "protobuf", "sentencepiece", "tiktoken", "torch", "transformers", "triton"]
test = ["pytest", "pytest-cov"]
[[package]]
name = "sympy"
@ -10853,4 +10852,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
content-hash = "6162482b9821778fed90d6cf4e252f90bf4dd70a44f44295837297d24f440138"
content-hash = "0aa5dc28564265aa19b0c90e6f65cd2b086a373ecdaa5c521542aa19d3c84ecf"

View File

@ -144,7 +144,7 @@ streamlit = "*"
whatthepatch = "*"
retry = "*"
evaluate = "*"
swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" }
swebench = "^3.0.8"
commit0 = "*"
func_timeout = "*"
sympy = "*"