mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Co-authored-by: Robert Brennan <accounts@rbren.io> Co-authored-by: openhands <openhands@all-hands.dev> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Graham Neubig <neubig@gmail.com>
33 lines
997 B
Python
33 lines
997 B
Python
import argparse
|
|
|
|
import pandas as pd
|
|
from datasets import load_dataset
|
|
|
|
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the gold-patch download script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the command-line
            entry point relies on.

    Returns:
        Namespace with ``output_filepath``, ``dataset_name`` and ``split``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'output_filepath', type=str, help='Path to save the output file'
    )
    parser.add_argument(
        '--dataset_name',
        type=str,
        help='Name of the dataset to download',
        default='princeton-nlp/SWE-bench_Verified',
    )
    parser.add_argument(
        '--split', type=str, help='Split to download', default='test'
    )
    return parser.parse_args(argv)


def build_gold_predictions(rows) -> list:
    """Turn dataset rows into gold-patch prediction records.

    Args:
        rows: Iterable of mappings that each provide an ``'instance_id'``
            and a ``'patch'`` entry. Any other keys are ignored.

    Returns:
        One dict per row with the keys ``'instance_id'``, ``'model_patch'``
        (the row's ``'patch'`` value) and ``'model_name_or_path'`` (always
        the literal ``'gold'``).
    """
    return [
        {
            'instance_id': row['instance_id'],
            'model_patch': row['patch'],
            'model_name_or_path': 'gold',
        }
        for row in rows
    ]


def main(argv=None) -> None:
    """Download a dataset split and save its gold patches as JSON lines.

    Args:
        argv: Optional argument list forwarded to ``parse_args``; ``None``
            means "use the process command line".
    """
    args = parse_args(argv)
    output_filepath = args.output_filepath

    # Announce the download *before* starting it, so the message is visible
    # while the (potentially slow) fetch is in progress.
    print(
        f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
    )
    dataset = load_dataset(args.dataset_name, split=args.split)

    patches = build_gold_predictions(dataset)
    print(f'{len(patches)} gold patches loaded')

    # lines=True + orient='records' writes one JSON object per line.
    pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
    print(f'Patches saved to {output_filepath}')


if __name__ == '__main__':
    main()