SWE-Gym rollout stability fix & using a validated SWE-Gym set (#7182)

Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Xingyao Wang 2025-03-17 09:15:01 -04:00 committed by GitHub
parent 4f017081fc
commit a4d632498c
6 changed files with 4287 additions and 23 deletions


@@ -1,3 +1,4 @@
+import copy
 import json
 import os
 import subprocess
@@ -175,6 +176,11 @@ def process_instance(
         logger.warning(
             f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )
+        metadata = copy.deepcopy(metadata)
+        metadata.details['runtime_failure_count'] = runtime_failure_count
+        metadata.details['remote_runtime_resource_factor'] = (
+            config.sandbox.remote_runtime_resource_factor
+        )
 
     try:
         runtime = create_runtime(config)
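Note: the block added above snapshots the retry state into metadata.details, so each rollout attempt records the resource factor it actually ran with. The scaling itself is done by the harness's retry wrapper; as a rough sketch of the idea only, with a hypothetical helper name and assumed doubling-with-cap behavior (the exclusion comment later in this diff mentions a maximum factor of 8):

def resource_factor_for_attempt(
    runtime_failure_count: int, base_factor: float = 1.0, cap: float = 8.0
) -> float:
    # Hypothetical helper: double the remote-runtime resource factor on each
    # failed attempt, up to a cap. Attempts 0..3 -> 1.0, 2.0, 4.0, 8.0.
    return min(base_factor * 2**runtime_failure_count, cap)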
@@ -296,14 +302,20 @@ def process_instance(
     with open(test_output_path, 'w') as f:
         f.write(test_output)
     try:
+        extra_kwargs = {}
+        if 'SWE-Gym' in metadata.dataset:
+            # SWE-Gym uses a different version of the package, hence a different eval report argument
+            extra_kwargs['log_path'] = test_output_path
+        else:
+            extra_kwargs['test_log_path'] = test_output_path
         _report = conditional_imports.get_eval_report(
             test_spec=test_spec,
             prediction={
                 'model_patch': model_patch,
                 'instance_id': instance_id,
             },
-            test_log_path=test_output_path,
             include_tests_status=True,
+            **extra_kwargs,
         )
         report = _report[instance_id]
         logger.info(
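Note: the SWE-Gym fork of the evaluation package takes the report path under a different keyword, so the call above routes it through extra_kwargs instead of hard-coding test_log_path. The dispatch, isolated as a sketch (dataset and path values are illustrative):

def build_report_kwargs(dataset: str, test_output_path: str) -> dict[str, str]:
    # SWE-Gym's package version expects `log_path`; upstream expects `test_log_path`.
    key = 'log_path' if 'SWE-Gym' in dataset else 'test_log_path'
    return {key: test_output_path}

assert build_report_kwargs('SWE-Gym/SWE-Gym', '/tmp/out.log') == {'log_path': '/tmp/out.log'}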
@@ -463,6 +475,7 @@ if __name__ == '__main__':
         .decode('utf-8')
         .strip(),  # Current commit
         dataset=args.dataset,  # Dataset name from args
+        details={},
     )
 
     # The evaluation harness constrains the signature of `process_instance_func` but we need to

File diff suppressed because it is too large.


@@ -23,7 +23,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
     if dataset_name not in _global_resource_mapping:
         file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
         if not os.path.exists(file_path):
-            logger.warning(f'Resource mapping for {dataset_name} not found.')
+            logger.info(f'Resource mapping for {dataset_name} not found.')
             return None
         with open(file_path, 'r') as f:

@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import json
 import os
 import tempfile
@@ -149,7 +150,8 @@ def get_config(
 ) -> AppConfig:
     # We use a different instance image for the each instance of swe-bench eval
     use_official_image = bool(
-        'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
+        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
+        and 'swe-gym' not in metadata.dataset.lower()
     )
     base_container_image = get_instance_docker_image(
         instance['instance_id'], use_official_image
@@ -475,6 +477,13 @@ def process_instance(
         logger.warning(
             f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )
+        metadata = copy.deepcopy(metadata)
+        metadata.details['runtime_failure_count'] = runtime_failure_count
+        metadata.details['remote_runtime_resource_factor'] = (
+            config.sandbox.remote_runtime_resource_factor
+        )
 
     runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
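Note: the added parentheses in get_config are load-bearing, not cosmetic: Python's `and` binds tighter than `or`, so without them any dataset name containing 'verified' would still select the official image, SWE-Gym included. A quick illustration with a made-up dataset name:

name = 'SWE-Gym/SWE-Gym-verified'.lower()
# Without parentheses this parses as A or (B and C) -> True (official image, wrong).
wrong = 'verified' in name or 'lite' in name and 'swe-gym' not in name
# With parentheses: (A or B) and C -> False (SWE-Gym image, intended).
right = ('verified' in name or 'lite' in name) and 'swe-gym' not in name
assert wrong and not right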
@@ -560,20 +569,6 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     return dataset
 
-# A list of instances that are known to be tricky to infer
-# (will cause runtime failure even with resource factor = 8)
-SWEGYM_EXCLUDE_IDS = [
-    'dask__dask-10422',
-    'pandas-dev__pandas-50548',
-    'pandas-dev__pandas-53672',
-    'pandas-dev__pandas-54174',
-    'pandas-dev__pandas-55518',
-    'pandas-dev__pandas-58383',
-    'pydata__xarray-6721',
-    'pytest-dev__pytest-10081',
-    'pytest-dev__pytest-7236',
-]
-
 
 if __name__ == '__main__':
     parser = get_parser()
     parser.add_argument(
@@ -598,11 +593,20 @@ if __name__ == '__main__':
         f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
     )
 
     if 'SWE-Gym' in args.dataset:
-        swe_bench_tests = swe_bench_tests[
-            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
-        ]
+        with open(
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                'split',
+                'swegym_verified_instances.json',
+            ),
+            'r',
+        ) as f:
+            swegym_verified_instances = json.load(f)
+        swe_bench_tests = swe_bench_tests[
+            swe_bench_tests['instance_id'].isin(swegym_verified_instances)
+        ]
         logger.info(
-            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
+            f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
         )
 
     llm_config = None
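Note: the hard-coded SWEGYM_EXCLUDE_IDS blocklist is replaced here by an allowlist loaded from split/swegym_verified_instances.json (presumably one of the large files suppressed in this view). Assuming that file is a flat JSON array of instance IDs, the filter reduces to this sketch, with a toy DataFrame standing in for the loaded dataset:

import json
import pandas as pd

# Assumed file shape: ["owner__repo-123", "owner__repo-456", ...]
with open('split/swegym_verified_instances.json') as f:
    verified = set(json.load(f))

swe_bench_tests = pd.DataFrame({'instance_id': ['owner__repo-123', 'owner__repo-999']})
# isin() keeps only rows whose instance_id appears in the allowlist.
swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'].isin(verified)]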


@@ -9,7 +9,7 @@ parser.add_argument(
     '--dataset_name',
     type=str,
     help='Name of the dataset to download',
-    default='princeton-nlp/SWE-bench_Lite',
+    default='princeton-nlp/SWE-bench_Verified',
 )
 parser.add_argument('--split', type=str, help='Split to download', default='test')
 args = parser.parse_args()
@@ -20,7 +20,12 @@ print(
     f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
 )
 patches = [
-    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
+    {
+        'instance_id': row['instance_id'],
+        'model_patch': row['patch'],
+        'model_name_or_path': 'gold',
+    }
+    for row in dataset
 ]
 print(f'{len(patches)} gold patches loaded')
 pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
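Note: each gold patch row now also carries model_name_or_path='gold', letting downstream report tooling attribute the predictions. A minimal sketch of the resulting JSONL output (all values illustrative):

import pandas as pd

patches = [{
    'instance_id': 'example__repo-1',
    'model_patch': 'diff --git a/f.py b/f.py\n...',
    'model_name_or_path': 'gold',
}]
pd.DataFrame(patches).to_json('gold_patches.jsonl', lines=True, orient='records')
# Each output line is one JSON record, e.g.:
# {"instance_id":"example__repo-1","model_patch":"...","model_name_or_path":"gold"}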

File diff suppressed because it is too large.