Mirror of https://github.com/OpenHands/OpenHands.git

feat(eval): misc SWE-Bench improvement - use different resources for different instances (#6313)
Co-authored-by: openhands <openhands@all-hands.dev>

parent: 0661c69bd3
commit: 72af7bbba2
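At a glance, the commit (a) looks up a per-instance remote-runtime resource factor from a new mapping module and (b) doubles that factor on every runtime failure, capped at 8. A minimal sketch of the retry scaling follows; it mirrors the min(factor * 2**runtime_failure_count, 8) expression in the diff below, and the helper name and example values are illustrative, not part of the commit.

# Sketch of the retry scaling used in process_instance (helper name and values are illustrative).
def scaled_resource_factor(base_factor: int, runtime_failure_count: int) -> int:
    # Double the resource factor for each failed runtime attempt, capped at 8.
    return min(base_factor * (2**runtime_failure_count), 8)

assert scaled_resource_factor(1, 0) == 1  # first attempt keeps the mapped factor
assert scaled_resource_factor(1, 2) == 4  # third attempt after two runtime failures
assert scaled_resource_factor(2, 3) == 8  # capped at the hardcoded maximum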
@@ -1,3 +1,4 @@
import json
import os
import tempfile
import time
@@ -11,7 +12,11 @@ from swebench.harness.run_evaluation import (
)
from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
from swebench.harness.utils import load_swebench_dataset
from tqdm import tqdm

from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.benchmarks.swe_bench.run_infer import get_instance_docker_image
from evaluation.utils.shared import (
    EvalMetadata,
@@ -81,10 +86,14 @@ def get_config(instance: pd.Series) -> AppConfig:
            base_container_image=base_container_image,
            use_host_network=False,
            # large enough timeout, since some testcases take very long to run
            timeout=1800,
            timeout=600,
            api_key=os.environ.get('ALLHANDS_API_KEY', None),
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            remote_runtime_init_timeout=3600,
            remote_runtime_resource_factor=get_instance_resource_factor(
                dataset_name=metadata.dataset,
                instance_id=instance['instance_id'],
            ),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -151,52 +160,52 @@ def process_instance(
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            4,  # hardcode maximum resource factor to 4
            8,
        )
        logger.warning(
            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )

    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    # Get patch and save it to /tmp/patch.diff
    with tempfile.TemporaryDirectory() as temp_dir:
        # Patch file
        patch_file_path = os.path.join(temp_dir, 'patch.diff')
        with open(patch_file_path, 'w') as f:
            f.write(model_patch)
        runtime.copy_to(patch_file_path, '/tmp')
        # Eval script
        eval_script_path = os.path.join(temp_dir, 'eval.sh')
        with open(eval_script_path, 'w') as f:
            f.write(test_spec.eval_script)
        runtime.copy_to(eval_script_path, '/tmp')

    # Set +x
    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

    # Apply patch
    exec_command = (
        'cd /testbed && '
        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "echo 'APPLY_PATCH_FAIL')))"
    )
    action = CmdRunAction(command=exec_command)
    action.set_hard_timeout(600)
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation)
    apply_patch_output = obs.content
    assert isinstance(apply_patch_output, str)
    instance['test_result']['apply_patch_output'] = apply_patch_output

    try:
        runtime = create_runtime(config)
        call_async_from_sync(runtime.connect)
        # Get patch and save it to /tmp/patch.diff
        with tempfile.TemporaryDirectory() as temp_dir:
            # Patch file
            patch_file_path = os.path.join(temp_dir, 'patch.diff')
            with open(patch_file_path, 'w') as f:
                f.write(model_patch)
            runtime.copy_to(patch_file_path, '/tmp')
            # Eval script
            eval_script_path = os.path.join(temp_dir, 'eval.sh')
            with open(eval_script_path, 'w') as f:
                f.write(test_spec.eval_script)
            runtime.copy_to(eval_script_path, '/tmp')

        # Set +x
        action = CmdRunAction(command='chmod +x /tmp/eval.sh')
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert obs.exit_code == 0

        # Apply patch
        exec_command = (
            'cd /testbed && '
            "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
            "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
            "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
            "echo 'APPLY_PATCH_FAIL')))"
        )
        action = CmdRunAction(command=exec_command)
        action.set_hard_timeout(600)
        obs = runtime.run_action(action)
        assert isinstance(obs, CmdOutputObservation)
        apply_patch_output = obs.content
        assert isinstance(apply_patch_output, str)
        instance['test_result']['apply_patch_output'] = apply_patch_output

        if 'APPLY_PATCH_FAIL' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
            instance['test_result']['report']['failed_apply_patch'] = True
@@ -212,7 +221,7 @@ def process_instance(
        # Run eval script in background and save output to log file
        log_file = '/tmp/eval_output.log'
        action = CmdRunAction(command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!')
        action.set_hard_timeout(60)  # Short timeout just to get the process ID
        action.set_hard_timeout(300)  # Short timeout just to get the process ID
        obs = runtime.run_action(action)

        if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
@@ -235,7 +244,7 @@ def process_instance(
                check_action = CmdRunAction(
                    command=f'ps -p {pid} > /dev/null; echo $?'
                )
                check_action.set_hard_timeout(60)
                check_action.set_hard_timeout(300)
                check_obs = runtime.run_action(check_action)
                if (
                    isinstance(check_obs, CmdOutputObservation)
@@ -352,7 +361,14 @@ if __name__ == '__main__':

    # Load predictions
    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
    predictions = pd.read_json(args.input_file, lines=True)
    required_fields = ['instance_id', 'model_patch', 'test_result']
    with open(args.input_file) as f:
        predictions = pd.DataFrame.from_records(
            [
                {k: v for k, v in json.loads(line).items() if k in required_fields}
                for line in tqdm(f, desc='Loading predictions')
            ]
        )
    assert (
        'instance_id' in predictions.columns
    ), 'Input file must contain instance_id column.'
evaluation/benchmarks/swe_bench/resource/mapping.py (new normal file, 38 lines added)
@@ -0,0 +1,38 @@
"""Mapping instance_id to resource_factor.

Different instances may have different resource requirements.
e.g., some instances may require more memory/CPU to run inference.
This file tracks the resource requirements of different instances.
"""

import json
import os
from openhands.core.logger import openhands_logger as logger

CUR_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
    os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
)

# dataset to resource mapping
_global_resource_mapping: dict[str, dict[str, float]] = {}


def get_resource_mapping(dataset_name: str) -> dict[str, float]:
    if dataset_name not in _global_resource_mapping:
        file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
        if not os.path.exists(file_path):
            logger.warning(f'Resource mapping for {dataset_name} not found.')
            return None

        with open(file_path, 'r') as f:
            _global_resource_mapping[dataset_name] = json.load(f)
        logger.info(f'Loaded resource mapping for {dataset_name}')
    return _global_resource_mapping[dataset_name]


def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int:
    resource_mapping = get_resource_mapping(dataset_name)
    if resource_mapping is None:
        return DEFAULT_RUNTIME_RESOURCE_FACTOR
    return int(resource_mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR))
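For illustration, a hedged usage sketch of the new module (not part of the commit): get_resource_mapping looks for '<dataset_name>.json' next to mapping.py, and instances absent from that file fall back to DEFAULT_RUNTIME_RESOURCE_FACTOR. The dataset key below is an assumption; the instance id and its factor of 8 come from the JSON mapping shown next.

# Hypothetical usage of the mapping module; the dataset name is assumed.
import os

os.environ.setdefault('DEFAULT_RUNTIME_RESOURCE_FACTOR', '1')  # read once at import time

from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)

factor = get_instance_resource_factor(
    dataset_name='princeton-nlp__SWE-bench_Verified-test',  # assumed mapping file stem
    instance_id='pydata__xarray-6721',  # mapped to 8 in the JSON below
)
print(factor)  # 8 if the mapping file exists, otherwise DEFAULT_RUNTIME_RESOURCE_FACTOR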
@@ -0,0 +1 @@
{"pydata__xarray-6721": 8, "pytest-dev__pytest-7236": 8, "matplotlib__matplotlib-24627": 4, "django__django-15561": 4, "django__django-15098": 4, "django__django-14771": 4, "sympy__sympy-21612": 4, "sympy__sympy-15345": 4, "psf__requests-5414": 4, "astropy__astropy-14508": 2, "django__django-11451": 2, "django__django-11477": 2, "django__django-10880": 2, "django__django-11163": 2, "django__django-11815": 2, "astropy__astropy-14369": 2, "django__django-10097": 2, "django__django-10554": 2, "django__django-12304": 2, "django__django-12325": 2, "django__django-11551": 2, "django__django-11734": 2, "django__django-13109": 2, "django__django-13089": 2, "django__django-13343": 2, "django__django-13363": 2, "django__django-13809": 2, "django__django-13810": 2, "django__django-13786": 2, "django__django-13807": 2, "django__django-14493": 2, "django__django-11820": 2, "django__django-11951": 2, "django__django-11964": 2, "astropy__astropy-14309": 2, "astropy__astropy-14365": 2, "astropy__astropy-12907": 2, "astropy__astropy-14182": 2, "django__django-15161": 2, "django__django-15128": 2, "django__django-14999": 2, "django__django-14915": 2, "django__django-14752": 2, "django__django-14765": 2, "django__django-14089": 2, "django__django-15252": 2, "django__django-15380": 2, "django__django-15382": 2, "django__django-15499": 2, "django__django-15467": 2, "django__django-15280": 2, "django__django-15315": 2, "django__django-15277": 2, "django__django-15268": 2, "django__django-15629": 2, "django__django-15695": 2, "django__django-15732": 2, "django__django-15863": 2, "django__django-16082": 2, "django__django-16145": 2, "django__django-16256": 2, "django__django-16429": 2, "django__django-16454": 2, "django__django-16493": 2, "matplotlib__matplotlib-13989": 2, "matplotlib__matplotlib-20488": 2, "django__django-15503": 2, "django__django-15525": 2, "django__django-15375": 2, "django__django-15278": 2, "matplotlib__matplotlib-21568": 2, "matplotlib__matplotlib-20859": 2, "matplotlib__matplotlib-20826": 2, "matplotlib__matplotlib-20676": 2, "matplotlib__matplotlib-23412": 2, "matplotlib__matplotlib-22719": 2, "matplotlib__matplotlib-23299": 2, "matplotlib__matplotlib-22865": 2, "matplotlib__matplotlib-24149": 2, "matplotlib__matplotlib-24177": 2, "matplotlib__matplotlib-24570": 2, "matplotlib__matplotlib-24637": 2, "matplotlib__matplotlib-24970": 2, "matplotlib__matplotlib-23476": 2, "matplotlib__matplotlib-24026": 2, "matplotlib__matplotlib-23314": 2, "matplotlib__matplotlib-25332": 2, "matplotlib__matplotlib-25311": 2, "matplotlib__matplotlib-25122": 2, "matplotlib__matplotlib-25479": 2, "matplotlib__matplotlib-26342": 2, "psf__requests-2317": 2, "matplotlib__matplotlib-25960": 2, "matplotlib__matplotlib-25775": 2, "pydata__xarray-4356": 2, "pydata__xarray-4075": 2, "pydata__xarray-6461": 2, "pydata__xarray-4687": 2, "pydata__xarray-6599": 2, "pylint-dev__pylint-4661": 2, "django__django-15554": 2, "django__django-15563": 2, "pytest-dev__pytest-5262": 2, "pytest-dev__pytest-10081": 2, "scikit-learn__scikit-learn-12973": 2, "scikit-learn__scikit-learn-13124": 2, "scikit-learn__scikit-learn-13779": 2, "scikit-learn__scikit-learn-14141": 2, "scikit-learn__scikit-learn-13439": 2, "scikit-learn__scikit-learn-13496": 2, "scikit-learn__scikit-learn-15100": 2, "scikit-learn__scikit-learn-25102": 2, "scikit-learn__scikit-learn-25232": 2, "scikit-learn__scikit-learn-25747": 2, "scikit-learn__scikit-learn-26323": 2, "scikit-learn__scikit-learn-9288": 2, "scikit-learn__scikit-learn-14496": 2, 
"scikit-learn__scikit-learn-14629": 2, "sphinx-doc__sphinx-8265": 2, "sphinx-doc__sphinx-8548": 2, "sphinx-doc__sphinx-8593": 2, "sphinx-doc__sphinx-8595": 2, "sphinx-doc__sphinx-8621": 2, "sphinx-doc__sphinx-8638": 2, "sphinx-doc__sphinx-9229": 2, "sphinx-doc__sphinx-9281": 2, "sphinx-doc__sphinx-9461": 2, "sphinx-doc__sphinx-9591": 2, "sphinx-doc__sphinx-9658": 2, "sphinx-doc__sphinx-9673": 2, "sympy__sympy-12096": 2, "sympy__sympy-12481": 2, "sphinx-doc__sphinx-10323": 2, "sphinx-doc__sphinx-7590": 2, "sympy__sympy-13877": 2, "sympy__sympy-12489": 2, "sympy__sympy-15809": 2, "sympy__sympy-14711": 2, "sympy__sympy-16597": 2, "sympy__sympy-16766": 2, "sympy__sympy-16792": 2, "sympy__sympy-15875": 2, "sympy__sympy-17655": 2, "sympy__sympy-18189": 2, "sympy__sympy-18763": 2, "sympy__sympy-19040": 2, "sympy__sympy-19495": 2, "sympy__sympy-19637": 2, "sympy__sympy-19783": 2, "sympy__sympy-17630": 2, "sympy__sympy-20428": 2, "sympy__sympy-20590": 2, "sympy__sympy-20801": 2, "sympy__sympy-21379": 2, "sympy__sympy-21847": 2, "sympy__sympy-22456": 2, "sympy__sympy-22714": 2, "sympy__sympy-22914": 2, "sympy__sympy-23262": 2, "sympy__sympy-23413": 2, "sympy__sympy-23534": 2, "sympy__sympy-24066": 2, "sympy__sympy-24213": 2, "sympy__sympy-24443": 2, "sympy__sympy-24562": 2, "sympy__sympy-24661": 2}
@@ -9,6 +9,9 @@ import toml
from datasets import load_dataset

import openhands.agenthub
from evaluation.benchmarks.swe_bench.resource.mapping import (
    get_instance_resource_factor,
)
from evaluation.utils.shared import (
    EvalException,
    EvalMetadata,
@@ -41,9 +44,10 @@ from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'true').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'


AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}
@@ -135,6 +139,10 @@ def get_config(
            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
            keep_runtime_alive=False,
            remote_runtime_init_timeout=3600,
            remote_runtime_resource_factor=get_instance_resource_factor(
                dataset_name=metadata.dataset,
                instance_id=instance['instance_id'],
            ),
        ),
        # do not mount workspace
        workspace_base=None,
@@ -239,7 +247,7 @@ def initialize_runtime(
    assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')

    action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
    action.set_hard_timeout(3600)
    action.set_hard_timeout(600)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -351,7 +359,7 @@ def complete_runtime(
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]}'
        )
        action.set_hard_timeout(600 + 100 * n_retries)
        action.timeout = max(300 + 100 * n_retries, 600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -399,7 +407,7 @@ def process_instance(
            8,
        )
        logger.warning(
            f'This is the second attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
@@ -479,6 +487,10 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
                subset = dataset[dataset[filter_column].isin(selected_ids)]
                logger.info(f'Retained {subset.shape[0]} tasks after filtering')
                return subset
    skip_ids = os.environ.get('SKIP_IDS', '').split(',')
    if len(skip_ids) > 0:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset


@@ -501,8 +513,10 @@
    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
    logger.info(f'Loaded dataset {args.dataset} with split {args.split}')
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )

    llm_config = None
    if args.llm_config:
@@ -531,6 +545,7 @@
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')
    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

    if len(instances) > 0 and not isinstance(
@@ -0,0 +1,69 @@
import argparse
import gzip
import json
import os
from glob import glob

from tqdm import tqdm

tqdm.pandas()


# Load trajectories for resolved instances
def load_completions(output_dir: str, instance_id: str):
    glob_path = os.path.join(output_dir, 'llm_completions', instance_id, '*.json')
    files = sorted(glob(glob_path))  # this is ascending order
    # pick the last file (last turn)
    try:
        file_path = files[-1]
    except IndexError:
        # print(f'No files found for instance {instance_id}: files={files}')
        return None
    with open(file_path, 'r') as f:
        result = json.load(f)
    # create messages
    messages = result['messages']
    messages.append(result['response']['choices'][0]['message'])
    tools = result['kwargs']['tools']
    return {
        'messages': messages,
        'tools': tools,
    }


parser = argparse.ArgumentParser()
parser.add_argument('jsonl_path', type=str)
args = parser.parse_args()

output_dir = os.path.dirname(args.jsonl_path)
output_path = os.path.join(output_dir, 'output.with_completions.jsonl.gz')

# Check if output would be different from input
needs_update = False
with open(args.jsonl_path, 'r') as f_in:
    for line in tqdm(f_in, desc='Checking for changes'):
        data = json.loads(line)
        new_completions = load_completions(output_dir, data['instance_id'])
        current_completions = data.get('raw_completions')
        if current_completions != new_completions:
            needs_update = True
            break

if not needs_update:
    print('No updates required. Skipping file update.')
    exit(0)

if os.path.exists(output_path):
    print(f'Output file already exists at {output_path}, overwriting? (y/n)')
    if input() != 'y':
        print('Exiting...')
        exit(0)

# Process line by line
with open(args.jsonl_path, 'r') as f_in, gzip.open(output_path, 'wt') as f_out:
    for line in tqdm(f_in):
        data = json.loads(line)
        data['raw_completions'] = load_completions(output_dir, data['instance_id'])
        f_out.write(json.dumps(data) + '\n')

print(f'Saved compressed output to {output_path}')
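For reference, a small hedged sketch (not from the commit) of reading back the gzip-compressed JSONL the script above writes; the file name matches the script's output_path, and 'raw_completions' is the key it populates.

# Read back the compressed output produced by the script above.
import gzip
import json

with gzip.open('output.with_completions.jsonl.gz', 'rt') as f:
    for line in f:
        row = json.loads(line)
        # raw_completions holds the last-turn messages/tools, or None when no
        # llm_completions files were found for that instance.
        print(row['instance_id'], row['raw_completions'] is not None)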
@@ -22,7 +22,8 @@ def convert_row_to_swebench_format(row):
    elif 'test_result' in row and 'git_patch' in row['test_result']:
        model_patch = row['test_result']['git_patch']
    else:
        raise ValueError(f'Row {row} does not have a git_patch')
        print(f'WARNING: Row {row} does not have a git_patch')
        model_patch = ''

    return {
        'instance_id': row['instance_id'],
@@ -3,7 +3,7 @@ import json
import os
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('input_file', type=str)
@@ -11,8 +11,7 @@ args = parser.parse_args()

dirname = os.path.dirname(args.input_file)

df = pd.read_json(args.input_file, lines=True)

# Initialize counters and data structures
instance_id_to_status = defaultdict(
    lambda: {
        'empty_generation': False,
@@ -23,15 +22,7 @@ instance_id_to_status = defaultdict(
    }
)


# Apply the status to the dataframe
def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


# Process official report if it exists
swebench_official_report_json = os.path.join(dirname, 'report.json')
openhands_remote_report_jsonl = args.input_file.replace(
    '.jsonl', '.swebench_eval.jsonl'
@@ -90,113 +81,159 @@ if os.path.exists(swebench_official_report_json):
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    df['report'] = df.apply(apply_report, axis=1)

    with open(output_md_filepath, 'w') as f:
        f.write(output_md)

elif os.path.exists(openhands_remote_report_jsonl):
    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')

    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
    # First pass: Read eval report and count instances
    instance_ids = set()
    eval_instance_ids = set()

    assert len(df['instance_id'].unique()) == len(
        df
    ), 'There are duplicate instance ids in the original output which is not allowed'
    assert len(df_eval['instance_id'].unique()) == len(
        df_eval
    ), 'There are duplicate instance ids in the eval report which is not allowed'
    # Count instances in original file
    n_instances = 0
    with open(args.input_file, 'r') as f:
        for line in tqdm(f, desc='Counting instances in original file'):
            data = json.loads(line)
            instance_ids.add(data['instance_id'])
            n_instances += 1
    print(f'Total instances in original file: {n_instances}')

    for _, row in df_eval.iterrows():
        instance_id_to_status[row['instance_id']] = row['test_result']['report']
    df['report'] = df.apply(apply_report, axis=1)
    # Process eval report
    n_eval_instances = 0
    with open(openhands_remote_report_jsonl, 'r') as f:
        for line in tqdm(f, desc='Processing eval report'):
            data = json.loads(line)
            instance_id = data['instance_id']
            eval_instance_ids.add(instance_id)
            n_eval_instances += 1
            instance_id_to_status[instance_id] = data['test_result']['report']
    print(f'Total instances in eval report: {n_eval_instances}')

    report_is_dict = df['report'].apply(lambda x: isinstance(x, dict))
    if not report_is_dict.all():
        print(df[~report_is_dict])
        raise ValueError(f'Report is not a dict, but a {type(row["report"])}')
    # Verify no duplicates
    assert (
        len(instance_ids) == n_instances
    ), 'Duplicate instance ids found in original output'
    assert (
        len(eval_instance_ids) == n_eval_instances
    ), 'Duplicate instance ids found in eval report'

    _n_instances = len(df)
    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
    _n_unresolved = _n_instances - _n_resolved
    _n_empty_patch = len(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
    )
    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
    # Initialize counters
    stats = {'total': len(instance_ids), 'resolved': 0, 'empty_patch': 0, 'error': 0}

    # Collect instance IDs by category
    resolved_ids = []
    unresolved_ids = []
    error_ids = []
    empty_patch_ids = []
    timeout_ids = []

    # Process original file and categorize instances
    with open(args.input_file, 'r') as f:
        for line in f:
            data = json.loads(line)
            instance_id = data['instance_id']
            report = instance_id_to_status[instance_id]

            if report.get('resolved', False):
                stats['resolved'] += 1
                resolved_ids.append(instance_id)
            else:
                unresolved_ids.append(instance_id)

            if report.get('empty_generation', False):
                stats['empty_patch'] += 1
                empty_patch_ids.append(instance_id)
            if report.get('error_eval', False):
                stats['error'] += 1
                error_ids.append(instance_id)
            if report.get('test_timeout', False):
                timeout_ids.append(instance_id)

    # Generate markdown report
    def _instance_id_to_log_path(instance_id):
        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
        return os.path.relpath(path, start=dirname)

    # ... rest of markdown generation code remains the same ...
    output_md = (
        '# SWE-bench Report\n'
        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
        '## Summary\n'
        f'- submitted instances: {_n_instances}\n'
        f'- empty patch instances: {_n_empty_patch}\n'
        f'- resolved instances: {_n_resolved}\n'
        f'- unresolved instances: {_n_unresolved}\n'
        f'- error instances: {_n_error}\n'
        f'- submitted instances: {stats["total"]}\n'
        f'- empty patch instances: {stats["empty_patch"]}\n'
        f'- resolved instances: {stats["resolved"]}\n'
        f'- unresolved instances: {len(unresolved_ids)}\n'
        f'- error instances: {stats["error"]}\n'
    )

    def _instance_id_to_log_path(instance_id):
        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
        # make it relative path
        path = os.path.relpath(path, start=dirname)
        return path

    output_md += '\n## Resolved Instances\n'
    # instance_id to status
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
    for instance_id in resolved_ids:
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Unresolved Instances\n'
    for instance_id in sorted(
        df[~df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
    for instance_id in unresolved_ids:
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Error Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('error_eval', False))][
            'instance_id'
        ].unique()
    ):
    for instance_id in error_ids:
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Empty Patch Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
            'instance_id'
        ].unique()
    ):
    for instance_id in empty_patch_ids:
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Incomplete Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
            'instance_id'
        ].unique()
    ):
    for instance_id in timeout_ids:
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    with open(output_md_filepath, 'w') as f:
        f.write(output_md)

else:
    print(
        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
    )
    exit()

# Before backup and update, check if any changes would be made
needs_update = False
with open(args.input_file, 'r') as infile:
    for line in tqdm(infile, desc='Checking for changes'):
        data = json.loads(line)
        instance_id = data['instance_id']
        if instance_id in instance_id_to_status:
            current_report = data.get('report', {})
            new_report = instance_id_to_status[instance_id]
            if current_report != new_report:
                needs_update = True
                break

if not needs_update:
    print('No updates detected. Skipping file update.')
    exit()

# Backup and update the original file row by row
if os.path.exists(args.input_file + '.bak'):
    conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
    if conf != 'y':
        exit()
    os.remove(args.input_file + '.bak')

# backup the original file
os.rename(args.input_file, args.input_file + '.bak')
df.to_json(args.input_file, orient='records', lines=True)

# Process and write file row by row
with open(args.input_file + '.bak', 'r') as infile, open(
    args.input_file, 'w'
) as outfile:
    for line in tqdm(infile, desc='Updating output file'):
        data = json.loads(line)
        instance_id = data['instance_id']
        if instance_id in instance_id_to_status:
            data['report'] = instance_id_to_status[instance_id]
        outfile.write(json.dumps(data) + '\n')
@@ -108,7 +108,14 @@ if [ -z "$N_RUNS" ]; then
  echo "N_RUNS not specified, use default $N_RUNS"
fi

# Skip runs if the run number is in the SKIP_RUNS list
# read from env variable SKIP_RUNS as a comma separated list of run numbers
SKIP_RUNS=(${SKIP_RUNS//,/ })
for i in $(seq 1 $N_RUNS); do
  if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
    echo "Skipping run $i"
    continue
  fi
  current_eval_note="$EVAL_NOTE-run_$i"
  echo "EVAL_NOTE: $current_eval_note"
  run_eval $current_eval_note