[eval] improve update output script for swe-bench (#4180)
commit 245334e89d (parent 80a631361b)
@@ -12,16 +12,22 @@ n_runtimes=$(echo $response | jq -r '.total')

 echo "Found ${n_runtimes} runtimes. Stopping them..."

 runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')

-# Loop through each runtime and stop it
-counter=1
-for runtime_id in $runtime_ids; do
+# Function to stop a single runtime
+stop_runtime() {
+    local runtime_id=$1
+    local counter=$2
     echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
     curl --silent --location --request POST "${BASE_URL}/stop" \
         --header "X-API-Key: ${ALLHANDS_API_KEY}" \
         --header "Content-Type: application/json" \
         --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
     echo
-    ((counter++))
-done
+}
+export -f stop_runtime
+export BASE_URL ALLHANDS_API_KEY n_runtimes
+
+# Use GNU Parallel to stop runtimes in parallel
+echo "$runtime_ids" | parallel -j 16 --progress stop_runtime {} {#}

 echo "All runtimes have been stopped."
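The new version stops runtimes through a `stop_runtime` shell function driven by GNU Parallel, with `{}` carrying the runtime id and `{#}` the job sequence number used for the progress counter. For readers who want the same fan-out from Python instead of GNU Parallel, a minimal sketch follows; it is not part of the patch, it assumes `BASE_URL` and `ALLHANDS_API_KEY` are exported in the environment just as the shell script does, and the `requests` dependency is also an assumption.

```python
# Sketch only, not part of the patch: stop runtimes concurrently from Python.
# Assumes BASE_URL and ALLHANDS_API_KEY are set in the environment, as in the shell script.
import os
from concurrent.futures import ThreadPoolExecutor

import requests

BASE_URL = os.environ['BASE_URL']
ALLHANDS_API_KEY = os.environ['ALLHANDS_API_KEY']


def stop_runtime(runtime_id: str, counter: int, total: int) -> None:
    # Mirrors the curl call above: POST {"runtime_id": ...} to ${BASE_URL}/stop.
    print(f'Stopping runtime {counter}/{total}: {runtime_id}')
    resp = requests.post(
        f'{BASE_URL}/stop',
        headers={'X-API-Key': ALLHANDS_API_KEY, 'Content-Type': 'application/json'},
        json={'runtime_id': runtime_id},
    )
    print(resp.text)


def stop_all(runtime_ids: list[str], max_workers: int = 16) -> None:
    # 16 workers mirrors `parallel -j 16` in the shell version.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, runtime_id in enumerate(runtime_ids, start=1):
            pool.submit(stop_runtime, runtime_id, i, len(runtime_ids))
```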
evaluation/swe_bench/scripts/eval/summarize_outputs.py (new executable file, 76 lines)
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
import argparse
import json
from collections import Counter

ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0

    num_resolved = 0
    num_empty_patch = 0

    error_counter = Counter()

    for line in lines:
        _d = json.loads(line)
        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1

        error = _d.get('error', None)

        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    print('-' * 100)
    print(
        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
    )
    print(
        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
    )
    print(
        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
    )
    print(
        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )
    print('-' * 100)
    print('Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
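summarize_outputs.py reads one JSON record per line and relies on only a few fields: `test_result.git_patch`, `report.resolved`, and `error`. As a usage illustration, the record shape below is an assumption inferred from those accesses (the instance ids and patch text are made up); running the script on such a file should report one resolved instance and one empty patch out of two.

```python
# Hypothetical two-record output file, shaped after the fields summarize_outputs.py reads.
import json

records = [
    {
        'instance_id': 'example__repo-1',  # made-up id
        'test_result': {'git_patch': 'diff --git a/x.py b/x.py\n+pass\n'},
        'report': {'resolved': True},
        'error': None,
    },
    {
        'instance_id': 'example__repo-2',  # made-up id
        'test_result': {'git_patch': ''},  # counted as an empty patch and skipped
        'report': {},
        'error': None,
    },
]

with open('output.jsonl', 'w') as f:
    for record in records:
        f.write(json.dumps(record) + '\n')

# Then, for example:
#   ./evaluation/swe_bench/scripts/eval/summarize_outputs.py output.jsonl
# should print "# of resolved: 1 / 2 (50.00%)" and "# of empty patch: 1 / 2 (50.00%)".
```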
@@ -10,16 +10,36 @@ parser.add_argument('input_file', type=str)
 args = parser.parse_args()

 dirname = os.path.dirname(args.input_file)
-report_json = os.path.join(dirname, 'report.json')

 df = pd.read_json(args.input_file, lines=True)

-output_md_filepath = os.path.join(dirname, 'README.md')
 instance_id_to_status = defaultdict(
-    lambda: {'resolved': False, 'empty_generation': False}
+    lambda: {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
 )
-if os.path.exists(report_json):
-    with open(report_json, 'r') as f:
+
+
+# Apply the status to the dataframe
+def apply_report(row):
+    instance_id = row['instance_id']
+    if instance_id in instance_id_to_status:
+        return dict(instance_id_to_status[instance_id])
+    return row.get('report', {})
+
+
+swebench_official_report_json = os.path.join(dirname, 'report.json')
+openhands_remote_report_jsonl = args.input_file.replace(
+    '.jsonl', '.swebench_eval.jsonl'
+)
+
+if os.path.exists(swebench_official_report_json):
+    output_md_filepath = os.path.join(dirname, 'README.md')
+    with open(swebench_official_report_json, 'r') as f:
         report = json.load(f)

     output_md = (
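With this change the script can pull per-instance statuses from either of two sources next to the input file: the official SWE-bench `report.json` in the same directory, or a remote-eval `*.swebench_eval.jsonl` derived from the input filename. A small sketch of that path derivation follows (the input path is a made-up example):

```python
# Path derivation mirroring the hunk above; the input path is a made-up example.
import os

input_file = 'evaluation/evaluation_outputs/swe-bench-lite/output.jsonl'
dirname = os.path.dirname(input_file)

swebench_official_report_json = os.path.join(dirname, 'report.json')
openhands_remote_report_jsonl = input_file.replace('.jsonl', '.swebench_eval.jsonl')

print(swebench_official_report_json)
# evaluation/evaluation_outputs/swe-bench-lite/report.json
print(openhands_remote_report_jsonl)
# evaluation/evaluation_outputs/swe-bench-lite/output.swebench_eval.jsonl
```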
@@ -70,15 +90,101 @@ if os.path.exists(report_json):
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )

-# Apply the status to the dataframe
-def apply_report(row):
-    instance_id = row['instance_id']
-    if instance_id in instance_id_to_status:
-        return dict(instance_id_to_status[instance_id])
-    return row.get('report', {})

+    df['report'] = df.apply(apply_report, axis=1)

+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)

+elif os.path.exists(openhands_remote_report_jsonl):
+    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
+
+    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
+
+    assert len(df['instance_id'].unique()) == len(
+        df
+    ), 'There are duplicate instance ids in the original output which is not allowed'
+    assert len(df_eval['instance_id'].unique()) == len(
+        df_eval
+    ), 'There are duplicate instance ids in the eval report which is not allowed'
+
+    for _, row in df_eval.iterrows():
+        instance_id_to_status[row['instance_id']] = row['test_result']['report']
+    df['report'] = df.apply(apply_report, axis=1)
+
+    _n_instances = len(df)
+    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
+    _n_unresolved = _n_instances - _n_resolved
+    _n_empty_patch = len(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
+    )
+    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    output_md = (
+        '# SWE-bench Report\n'
+        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
+        '## Summary\n'
+        f'- submitted instances: {_n_instances}\n'
+        f'- empty patch instances: {_n_empty_patch}\n'
+        f'- resolved instances: {_n_resolved}\n'
+        f'- unresolved instances: {_n_unresolved}\n'
+        f'- error instances: {_n_error}\n'
+    )
+
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        # make it relative path
+        path = os.path.relpath(path, start=dirname)
+        return path
+
+    output_md += '\n## Resolved Instances\n'
+    # instance_id to status
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['resolved'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Unresolved Instances\n'
+    for instance_id in sorted(
+        df[~df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Error Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('error_eval', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['error_eval'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
+else:
+    print(
+        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
+    )
+    exit()

 if os.path.exists(args.input_file + '.bak'):
     conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
@@ -89,6 +195,3 @@ if os.path.exists(args.input_file + '.bak'):
 # backup the original file
 os.rename(args.input_file, args.input_file + '.bak')
 df.to_json(args.input_file, orient='records', lines=True)
-
-with open(output_md_filepath, 'w') as f:
-    f.write(output_md)
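Both report branches funnel their per-instance statuses through `apply_report` before the updated records are written back over the original file. The toy example below (made-up instance ids, not from the patch) shows the pattern in isolation: ids present in `instance_id_to_status` get a copy of that status dict, and everything else keeps whatever `report` it already had.

```python
# Toy demonstration of the apply_report pattern above; the data is made up.
from collections import defaultdict

import pandas as pd

instance_id_to_status = defaultdict(
    lambda: {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
        'test_timeout': False,
    }
)
# Pretend the eval report marked one instance as resolved.
instance_id_to_status['example__repo-1']['resolved'] = True

df = pd.DataFrame(
    {
        'instance_id': ['example__repo-1', 'example__repo-2'],
        'report': [{}, {}],
    }
)


def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


df['report'] = df.apply(apply_report, axis=1)
print(df['report'].apply(lambda x: x.get('resolved', False)).tolist())
# [True, False]
```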
@@ -41,3 +41,6 @@ fi

 # Run the command
 eval $COMMAND
+
+# update the output with evaluation results
+poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE