Mirror of https://github.com/OpenHands/OpenHands.git (synced 2025-12-26 05:48:36 +08:00)
SWE-Gym rollout stability fix & using a validated SWE-Gym set (#7182)
Co-authored-by: Robert Brennan <accounts@rbren.io>
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
Co-authored-by: Graham Neubig <neubig@gmail.com>
parent 4f017081fc
commit a4d632498c
@@ -1,3 +1,4 @@
+import copy
 import json
 import os
 import subprocess
@@ -175,6 +176,11 @@ def process_instance(
         logger.warning(
             f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )
+        metadata = copy.deepcopy(metadata)
+        metadata.details['runtime_failure_count'] = runtime_failure_count
+        metadata.details['remote_runtime_resource_factor'] = (
+            config.sandbox.remote_runtime_resource_factor
+        )
 
     try:
         runtime = create_runtime(config)
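One plausible reading of the deepcopy added here: metadata is shared across retry attempts of the same instance, so annotating it in place would bleed one attempt's retry details into every later attempt's saved record. A minimal sketch of the pattern (EvalMetadata below is a stand-in dataclass, not the real class):

    import copy
    from dataclasses import dataclass, field

    @dataclass
    class EvalMetadata:  # stand-in for the real metadata object
        details: dict = field(default_factory=dict)

    def annotate_retry(metadata: EvalMetadata, failure_count: int, factor: float) -> EvalMetadata:
        metadata = copy.deepcopy(metadata)  # detach from the shared object first
        metadata.details['runtime_failure_count'] = failure_count
        metadata.details['remote_runtime_resource_factor'] = factor
        return metadata

    shared = EvalMetadata()
    annotated = annotate_retry(shared, failure_count=1, factor=2.0)
    assert shared.details == {}  # the original stays untouched across retries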
@@ -296,14 +302,20 @@ def process_instance(
         with open(test_output_path, 'w') as f:
             f.write(test_output)
         try:
+            extra_kwargs = {}
+            if 'SWE-Gym' in metadata.dataset:
+                # SWE-Gym uses a different version of the package, hence a different eval report argument
+                extra_kwargs['log_path'] = test_output_path
+            else:
+                extra_kwargs['test_log_path'] = test_output_path
             _report = conditional_imports.get_eval_report(
                 test_spec=test_spec,
                 prediction={
                     'model_patch': model_patch,
                     'instance_id': instance_id,
                 },
-                test_log_path=test_output_path,
                 include_tests_status=True,
+                **extra_kwargs,
             )
             report = _report[instance_id]
             logger.info(
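A small pattern worth noting in the hunk above: only the argument name that differs between package versions is routed through a dict and splatted into the call, so the shared arguments stay in one place. A toy sketch of the same idea (report_v1 and report_v2 are hypothetical stand-ins for the two get_eval_report variants):

    def report_v1(*, test_log_path: str, include_tests_status: bool) -> str:
        return f'v1 read {test_log_path} (status={include_tests_status})'

    def report_v2(*, log_path: str, include_tests_status: bool) -> str:
        return f'v2 read {log_path} (status={include_tests_status})'

    def run_report(is_swe_gym: bool, path: str) -> str:
        # Collect only the version-specific kwarg; shared kwargs are passed normally.
        extra_kwargs = {'log_path': path} if is_swe_gym else {'test_log_path': path}
        fn = report_v2 if is_swe_gym else report_v1
        return fn(include_tests_status=True, **extra_kwargs)

    print(run_report(True, 'test_output.txt'))   # v2 read test_output.txt (status=True)
    print(run_report(False, 'test_output.txt'))  # v1 read test_output.txt (status=True)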
@@ -463,6 +475,7 @@ if __name__ == '__main__':
         .decode('utf-8')
         .strip(),  # Current commit
         dataset=args.dataset,  # Dataset name from args
         details={},
     )
 
+    # The evaluation harness constrains the signature of `process_instance_func` but we need to
evaluation/benchmarks/swe_bench/resource/SWE-Gym__SWE-Gym-train.json | 2121 ++ (new file; diff suppressed because it is too large)
@@ -23,7 +23,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
     if dataset_name not in _global_resource_mapping:
         file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
         if not os.path.exists(file_path):
-            logger.warning(f'Resource mapping for {dataset_name} not found.')
+            logger.info(f'Resource mapping for {dataset_name} not found.')
             return None
 
         with open(file_path, 'r') as f:
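For context, get_resource_mapping lazily loads a {dataset_name}.json file next to the module and caches it in _global_resource_mapping; the warning-to-info downgrade reflects that a missing mapping file is an expected, non-error case. The new SWE-Gym__SWE-Gym-train.json presumably feeds this lookup; assuming a flat instance-id to factor object (an assumption, since the file's diff is suppressed), usage would look like:

    import json

    # Hypothetical file content; the real SWE-Gym__SWE-Gym-train.json is 2121 lines.
    text = '{"dask__dask-10422": 8.0, "pydata__xarray-6721": 4.0}'
    mapping: dict[str, float] = json.loads(text)

    # Unknown instances fall back to a default resource factor of 1.
    print(mapping.get('dask__dask-10422', 1.0))   # 8.0
    print(mapping.get('some__other-123', 1.0))    # 1.0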
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import json
 import os
 import tempfile
@@ -149,7 +150,8 @@ def get_config(
 ) -> AppConfig:
     # We use a different instance image for the each instance of swe-bench eval
     use_official_image = bool(
-        'verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower()
+        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
+        and 'swe-gym' not in metadata.dataset.lower()
     )
     base_container_image = get_instance_docker_image(
         instance['instance_id'], use_official_image
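The added parentheses are load-bearing, not cosmetic: in Python, `and` binds tighter than `or`, so without them the new 'swe-gym' guard would attach only to the 'lite' branch, and any dataset name containing 'verified' would still select the official image. A quick demonstration:

    # `and` binds tighter than `or`: a or b and c  ==  a or (b and c)
    verified, lite, not_swe_gym = True, False, False  # e.g. a SWE-Gym name containing 'verified'
    print(verified or lite and not_swe_gym)    # True  -- the guard is silently skipped
    print((verified or lite) and not_swe_gym)  # False -- the guard applies to both checks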
@@ -475,6 +477,13 @@ def process_instance(
         logger.warning(
             f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
         )
+
+        metadata = copy.deepcopy(metadata)
+        metadata.details['runtime_failure_count'] = runtime_failure_count
+        metadata.details['remote_runtime_resource_factor'] = (
+            config.sandbox.remote_runtime_resource_factor
+        )
+
     runtime = create_runtime(config)
     call_async_from_sync(runtime.connect)
 
@@ -560,20 +569,6 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
     return dataset
 
 
-# A list of instances that are known to be tricky to infer
-# (will cause runtime failure even with resource factor = 8)
-SWEGYM_EXCLUDE_IDS = [
-    'dask__dask-10422',
-    'pandas-dev__pandas-50548',
-    'pandas-dev__pandas-53672',
-    'pandas-dev__pandas-54174',
-    'pandas-dev__pandas-55518',
-    'pandas-dev__pandas-58383',
-    'pydata__xarray-6721',
-    'pytest-dev__pytest-10081',
-    'pytest-dev__pytest-7236',
-]
-
 if __name__ == '__main__':
     parser = get_parser()
     parser.add_argument(
@@ -598,11 +593,20 @@ if __name__ == '__main__':
         f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
     )
     if 'SWE-Gym' in args.dataset:
-        swe_bench_tests = swe_bench_tests[
-            ~swe_bench_tests['instance_id'].isin(SWEGYM_EXCLUDE_IDS)
-        ]
+        with open(
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                'split',
+                'swegym_verified_instances.json',
+            ),
+            'r',
+        ) as f:
+            swegym_verified_instances = json.load(f)
+        swe_bench_tests = swe_bench_tests[
+            swe_bench_tests['instance_id'].isin(swegym_verified_instances)
+        ]
         logger.info(
-            f'{len(swe_bench_tests)} tasks left after excluding SWE-Gym excluded tasks'
+            f'{len(swe_bench_tests)} tasks left after filtering for SWE-Gym verified instances'
         )
 
     llm_config = None
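So the hard-coded SWEGYM_EXCLUDE_IDS blocklist (removed above) gives way to an allowlist of validated instances shipped as split/swegym_verified_instances.json. The filtering itself is ordinary pandas; a toy sketch (the file is assumed to be a JSON array of instance ids, which is what isin expects here):

    import json
    import pandas as pd

    df = pd.DataFrame(
        {'instance_id': ['dask__dask-10422', 'demo__demo-1', 'pydata__xarray-6721']}
    )
    # Stands in for split/swegym_verified_instances.json.
    verified = json.loads('["demo__demo-1"]')

    df = df[df['instance_id'].isin(verified)]
    print(f'{len(df)} tasks left')  # 1 tasks left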
@@ -9,7 +9,7 @@ parser.add_argument(
     '--dataset_name',
     type=str,
     help='Name of the dataset to download',
-    default='princeton-nlp/SWE-bench_Lite',
+    default='princeton-nlp/SWE-bench_Verified',
 )
 parser.add_argument('--split', type=str, help='Split to download', default='test')
 args = parser.parse_args()
@@ -20,7 +20,12 @@ print(
     f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
 )
 patches = [
-    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
+    {
+        'instance_id': row['instance_id'],
+        'model_patch': row['patch'],
+        'model_name_or_path': 'gold',
+    }
+    for row in dataset
 ]
 print(f'{len(patches)} gold patches loaded')
 pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
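The new model_name_or_path field is likely there because SWE-bench-style prediction files key each entry on instance_id, model_patch, and model_name_or_path (my reading; the consumer is not shown in this diff). One line of the resulting JSONL would look like:

    import pandas as pd

    patches = [{
        'instance_id': 'demo__demo-1',    # toy id
        'model_patch': 'diff --git ...',  # truncated toy patch
        'model_name_or_path': 'gold',
    }]
    pd.DataFrame(patches).to_json('gold_patches.jsonl', lines=True, orient='records')
    # gold_patches.jsonl now contains a single line:
    # {"instance_id":"demo__demo-1","model_patch":"diff --git ...","model_name_or_path":"gold"}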
evaluation/benchmarks/swe_bench/split/swegym_verified_instances.json | 2121 ++ (new file; diff suppressed because it is too large)