[eval] improve update output script for swe-bench (#4180)
commit 245334e89d (parent 80a631361b)
@@ -12,16 +12,22 @@ n_runtimes=$(echo $response | jq -r '.total')

 echo "Found ${n_runtimes} runtimes. Stopping them..."

 runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')

-# Loop through each runtime and stop it
-counter=1
-for runtime_id in $runtime_ids; do
+# Function to stop a single runtime
+stop_runtime() {
+    local runtime_id=$1
+    local counter=$2
     echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
     curl --silent --location --request POST "${BASE_URL}/stop" \
         --header "X-API-Key: ${ALLHANDS_API_KEY}" \
         --header "Content-Type: application/json" \
         --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
     echo
-    ((counter++))
-done
+}
+export -f stop_runtime
+export BASE_URL ALLHANDS_API_KEY n_runtimes
+
+# Use GNU Parallel to stop runtimes in parallel
+echo "$runtime_ids" | parallel -j 16 --progress stop_runtime {} {#}

 echo "All runtimes have been stopped."
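The new version stops runtimes through a `stop_runtime` shell function driven by GNU Parallel, with `{}` carrying the runtime id and `{#}` the job sequence number used for the progress counter. For readers who want the same fan-out from Python instead of GNU Parallel, a minimal sketch follows; it is not part of the patch, it assumes `BASE_URL` and `ALLHANDS_API_KEY` are exported in the environment just as the shell script does, and the `requests` dependency is also an assumption.

```python
# Sketch only, not part of the patch: stop runtimes concurrently from Python.
# Assumes BASE_URL and ALLHANDS_API_KEY are set in the environment, as in the shell script.
import os
from concurrent.futures import ThreadPoolExecutor

import requests

BASE_URL = os.environ['BASE_URL']
ALLHANDS_API_KEY = os.environ['ALLHANDS_API_KEY']


def stop_runtime(runtime_id: str, counter: int, total: int) -> None:
    # Mirrors the curl call above: POST {"runtime_id": ...} to ${BASE_URL}/stop.
    print(f'Stopping runtime {counter}/{total}: {runtime_id}')
    resp = requests.post(
        f'{BASE_URL}/stop',
        headers={'X-API-Key': ALLHANDS_API_KEY, 'Content-Type': 'application/json'},
        json={'runtime_id': runtime_id},
    )
    print(resp.text)


def stop_all(runtime_ids: list[str], max_workers: int = 16) -> None:
    # 16 workers mirrors `parallel -j 16` in the shell version.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for i, runtime_id in enumerate(runtime_ids, start=1):
            pool.submit(stop_runtime, runtime_id, i, len(runtime_ids))
```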
evaluation/swe_bench/scripts/eval/summarize_outputs.py (new executable file, 76 lines)
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
import argparse
import json
from collections import Counter

ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0

    num_resolved = 0
    num_empty_patch = 0

    error_counter = Counter()

    for line in lines:
        _d = json.loads(line)
        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1

        error = _d.get('error', None)

        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    print('-' * 100)
    print(
        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
    )
    print(
        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
    )
    print(
        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
    )
    print(
        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )
    print('-' * 100)
    print('Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
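summarize_outputs.py reads one JSON record per line and relies on only a few fields: `test_result.git_patch`, `report.resolved`, and `error`. As a usage illustration, the record shape below is an assumption inferred from those accesses (the instance ids and patch text are made up); running the script on such a file should report one resolved instance and one empty patch out of two.

```python
# Hypothetical two-record output file, shaped after the fields summarize_outputs.py reads.
import json

records = [
    {
        'instance_id': 'example__repo-1',  # made-up id
        'test_result': {'git_patch': 'diff --git a/x.py b/x.py\n+pass\n'},
        'report': {'resolved': True},
        'error': None,
    },
    {
        'instance_id': 'example__repo-2',  # made-up id
        'test_result': {'git_patch': ''},  # counted as an empty patch and skipped
        'report': {},
        'error': None,
    },
]

with open('output.jsonl', 'w') as f:
    for record in records:
        f.write(json.dumps(record) + '\n')

# Then, for example:
#   ./evaluation/swe_bench/scripts/eval/summarize_outputs.py output.jsonl
# should print "# of resolved: 1 / 2 (50.00%)" and "# of empty patch: 1 / 2 (50.00%)".
```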
@@ -10,16 +10,36 @@ parser.add_argument('input_file', type=str)
 args = parser.parse_args()

 dirname = os.path.dirname(args.input_file)
-report_json = os.path.join(dirname, 'report.json')

 df = pd.read_json(args.input_file, lines=True)

-output_md_filepath = os.path.join(dirname, 'README.md')
 instance_id_to_status = defaultdict(
-    lambda: {'resolved': False, 'empty_generation': False}
+    lambda: {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
 )
-if os.path.exists(report_json):
-    with open(report_json, 'r') as f:
+
+
+# Apply the status to the dataframe
+def apply_report(row):
+    instance_id = row['instance_id']
+    if instance_id in instance_id_to_status:
+        return dict(instance_id_to_status[instance_id])
+    return row.get('report', {})
+
+
+swebench_official_report_json = os.path.join(dirname, 'report.json')
+openhands_remote_report_jsonl = args.input_file.replace(
+    '.jsonl', '.swebench_eval.jsonl'
+)
+
+if os.path.exists(swebench_official_report_json):
+    output_md_filepath = os.path.join(dirname, 'README.md')
+    with open(swebench_official_report_json, 'r') as f:
         report = json.load(f)

     output_md = (
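With this change the script can pull per-instance statuses from either of two sources next to the input file: the official SWE-bench `report.json` in the same directory, or a remote-eval `*.swebench_eval.jsonl` derived from the input filename. A small sketch of that path derivation follows (the input path is a made-up example):

```python
# Path derivation mirroring the hunk above; the input path is a made-up example.
import os

input_file = 'evaluation/evaluation_outputs/swe-bench-lite/output.jsonl'
dirname = os.path.dirname(input_file)

swebench_official_report_json = os.path.join(dirname, 'report.json')
openhands_remote_report_jsonl = input_file.replace('.jsonl', '.swebench_eval.jsonl')

print(swebench_official_report_json)
# evaluation/evaluation_outputs/swe-bench-lite/report.json
print(openhands_remote_report_jsonl)
# evaluation/evaluation_outputs/swe-bench-lite/output.swebench_eval.jsonl
```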
@@ -70,15 +90,101 @@ if os.path.exists(report_json):
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )

-# Apply the status to the dataframe
-def apply_report(row):
-    instance_id = row['instance_id']
-    if instance_id in instance_id_to_status:
-        return dict(instance_id_to_status[instance_id])
-    return row.get('report', {})

+    df['report'] = df.apply(apply_report, axis=1)

+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)

+elif os.path.exists(openhands_remote_report_jsonl):
+    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
+
+    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
+
+    assert len(df['instance_id'].unique()) == len(
+        df
+    ), 'There are duplicate instance ids in the original output which is not allowed'
+    assert len(df_eval['instance_id'].unique()) == len(
+        df_eval
+    ), 'There are duplicate instance ids in the eval report which is not allowed'
+
+    for _, row in df_eval.iterrows():
+        instance_id_to_status[row['instance_id']] = row['test_result']['report']
+    df['report'] = df.apply(apply_report, axis=1)
+
+    _n_instances = len(df)
+    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
+    _n_unresolved = _n_instances - _n_resolved
+    _n_empty_patch = len(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
+    )
+    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    output_md = (
+        '# SWE-bench Report\n'
+        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
+        '## Summary\n'
+        f'- submitted instances: {_n_instances}\n'
+        f'- empty patch instances: {_n_empty_patch}\n'
+        f'- resolved instances: {_n_resolved}\n'
+        f'- unresolved instances: {_n_unresolved}\n'
+        f'- error instances: {_n_error}\n'
+    )
+
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        # make it relative path
+        path = os.path.relpath(path, start=dirname)
+        return path
+
+    output_md += '\n## Resolved Instances\n'
+    # instance_id to status
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['resolved'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Unresolved Instances\n'
+    for instance_id in sorted(
+        df[~df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Error Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('error_eval', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['error_eval'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
+else:
+    print(
+        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
+    )
+    exit()

 if os.path.exists(args.input_file + '.bak'):
     conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
@@ -89,6 +195,3 @@ if os.path.exists(args.input_file + '.bak'):
 # backup the original file
 os.rename(args.input_file, args.input_file + '.bak')
 df.to_json(args.input_file, orient='records', lines=True)
-
-with open(output_md_filepath, 'w') as f:
-    f.write(output_md)
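Both report branches funnel their per-instance statuses through `apply_report` before the updated records are written back over the original file. The toy example below (made-up instance ids, not from the patch) shows the pattern in isolation: ids present in `instance_id_to_status` get a copy of that status dict, and everything else keeps whatever `report` it already had.

```python
# Toy demonstration of the apply_report pattern above; the data is made up.
from collections import defaultdict

import pandas as pd

instance_id_to_status = defaultdict(
    lambda: {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
        'test_timeout': False,
    }
)
# Pretend the eval report marked one instance as resolved.
instance_id_to_status['example__repo-1']['resolved'] = True

df = pd.DataFrame(
    {
        'instance_id': ['example__repo-1', 'example__repo-2'],
        'report': [{}, {}],
    }
)


def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


df['report'] = df.apply(apply_report, axis=1)
print(df['report'].apply(lambda x: x.get('resolved', False)).tolist())
# [True, False]
```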
@@ -41,3 +41,6 @@ fi

 # Run the command
 eval $COMMAND
+
+# update the output with evaluation results
+poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE