[eval] improve update output script for swe-bench (#4180)

Xingyao Wang 2024-10-04 10:10:03 -05:00 committed by GitHub
parent 80a631361b
commit 245334e89d
4 changed files with 208 additions and 20 deletions

View File

@@ -12,16 +12,22 @@ n_runtimes=$(echo $response | jq -r '.total')
echo "Found ${n_runtimes} runtimes. Stopping them..."
runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
# Function to stop a single runtime
stop_runtime() {
    local runtime_id=$1
    local counter=$2
    echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
    curl --silent --location --request POST "${BASE_URL}/stop" \
        --header "X-API-Key: ${ALLHANDS_API_KEY}" \
        --header "Content-Type: application/json" \
        --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
    echo
}
export -f stop_runtime
export BASE_URL ALLHANDS_API_KEY n_runtimes
# Use GNU Parallel to stop runtimes in parallel
echo "$runtime_ids" | parallel -j 16 --progress stop_runtime {} {#}
echo "All runtimes have been stopped."

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
import argparse
import json
from collections import Counter
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    num_empty_patch = 0
    error_counter = Counter()

    for line in lines:
        _d = json.loads(line)

        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1

        error = _d.get('error', None)
        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    print('-' * 100)
    print(
        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
    )
    print(
        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
    )
    print(
        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
    )
    print(
        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )
    print('-' * 100)
    print('Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
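For reference, and not part of the commit: the summarizer above only relies on a handful of keys per JSONL line (test_result.git_patch, report.resolved, error, plus a substring scan for the ERROR_KEYWORDS). A hypothetical record with invented values might look like this:

import json

# Hypothetical example of one line in the summarized output file;
# the values are made up, only the keys read by the script matter.
sample = {
    'instance_id': 'example__repo-123',
    'test_result': {'git_patch': 'diff --git a/foo.py b/foo.py ...'},
    'report': {'resolved': True},
    'error': None,
}
print(json.dumps(sample))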

View File

@@ -10,16 +10,36 @@ parser.add_argument('input_file', type=str)
args = parser.parse_args()

dirname = os.path.dirname(args.input_file)
df = pd.read_json(args.input_file, lines=True)

instance_id_to_status = defaultdict(
    lambda: {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
        'test_timeout': False,
    }
)


# Apply the status to the dataframe
def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


swebench_official_report_json = os.path.join(dirname, 'report.json')
openhands_remote_report_jsonl = args.input_file.replace(
    '.jsonl', '.swebench_eval.jsonl'
)

if os.path.exists(swebench_official_report_json):
    output_md_filepath = os.path.join(dirname, 'README.md')
    with open(swebench_official_report_json, 'r') as f:
        report = json.load(f)
        output_md = (
@@ -70,15 +90,101 @@ if os.path.exists(report_json):
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    df['report'] = df.apply(apply_report, axis=1)
    with open(output_md_filepath, 'w') as f:
        f.write(output_md)
elif os.path.exists(openhands_remote_report_jsonl):
    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')

    assert len(df['instance_id'].unique()) == len(
        df
    ), 'There are duplicate instance ids in the original output which is not allowed'
    assert len(df_eval['instance_id'].unique()) == len(
        df_eval
    ), 'There are duplicate instance ids in the eval report which is not allowed'

    for _, row in df_eval.iterrows():
        instance_id_to_status[row['instance_id']] = row['test_result']['report']
    df['report'] = df.apply(apply_report, axis=1)

    _n_instances = len(df)
    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
    _n_unresolved = _n_instances - _n_resolved
    _n_empty_patch = len(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
    )
    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])

    output_md = (
        '# SWE-bench Report\n'
        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
        '## Summary\n'
        f'- submitted instances: {_n_instances}\n'
        f'- empty patch instances: {_n_empty_patch}\n'
        f'- resolved instances: {_n_resolved}\n'
        f'- unresolved instances: {_n_unresolved}\n'
        f'- error instances: {_n_error}\n'
    )

    def _instance_id_to_log_path(instance_id):
        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
        # make it relative path
        path = os.path.relpath(path, start=dirname)
        return path

    output_md += '\n## Resolved Instances\n'
    # instance_id to status
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Unresolved Instances\n'
    for instance_id in sorted(
        df[~df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Error Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('error_eval', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Empty Patch Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Incomplete Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
            'instance_id'
        ].unique()
    ):
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    with open(output_md_filepath, 'w') as f:
        f.write(output_md)
else:
    print(
        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
    )
    exit()

if os.path.exists(args.input_file + '.bak'):
    conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
@@ -89,6 +195,3 @@ if os.path.exists(args.input_file + '.bak'):
# backup the original file
os.rename(args.input_file, args.input_file + '.bak')
df.to_json(args.input_file, orient='records', lines=True)
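A minimal, self-contained sketch (with invented instance ids and pandas as the only dependency) of what the apply_report helper above does: rows whose instance_id appears in the evaluation report take that report, while every other row keeps whatever report it already had.

import pandas as pd

# Invented data for illustration only
instance_id_to_status = {
    'example__repo-1': {'resolved': True, 'empty_generation': False},
}


def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


df = pd.DataFrame(
    [
        {'instance_id': 'example__repo-1', 'report': {}},
        {'instance_id': 'example__repo-2', 'report': {'resolved': False}},
    ]
)
df['report'] = df.apply(apply_report, axis=1)
print(df)
# example__repo-1 now carries the evaluated status; example__repo-2 is unchanged.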

View File

@@ -41,3 +41,6 @@ fi
# Run the command
eval $COMMAND
# update the output with evaluation results
poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE
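Not part of the commit: a small sketch, using a hypothetical output path, of the two report sources that update_output_with_eval.py looks for next to the output file, derived from the path logic shown in the diff above.

import os

# Hypothetical output path; only the naming convention matters here.
input_file = 'evaluation/evaluation_outputs/swe_bench_lite/output.jsonl'
dirname = os.path.dirname(input_file)

# Official SWE-bench harness report, if present
swebench_official_report_json = os.path.join(dirname, 'report.json')
# OpenHands remote-runtime evaluation report, if present
openhands_remote_report_jsonl = input_file.replace('.jsonl', '.swebench_eval.jsonl')

print(swebench_official_report_json)
print(openhands_remote_report_jsonl)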