misc: Support folder-level exp analysis for SWE-Bench summarize_outputs.py; Handle CrashLoopBackoff for RemoteRuntime (#5385)

Xingyao Wang 2024-12-03 09:37:21 -06:00 committed by GitHub
parent 2f11634cca
commit 990f277132
4 changed files with 196 additions and 41 deletions
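The summarize_outputs.py diff below restructures the old single-file script into two reusable pieces: process_file() summarizes one output.jsonl into a nested dict, and aggregate_directory() walks an experiment directory and returns a pandas DataFrame sorted by resolve rate. As a rough sketch of how the new entry points compose (the import path and directory names here are placeholders, assuming the script can be imported as a module):

from summarize_outputs import process_file, aggregate_directory

# One run: nested dict of resolve / error / cost statistics
single = process_file('evaluation_outputs/my_run/output.jsonl')  # hypothetical path
print(single['resolved']['percentage'])

# Folder-level analysis: one row per **/output.jsonl under the directory,
# sorted by resolve_rate in descending order
df = aggregate_directory('evaluation_outputs/')  # hypothetical path
print(df[['directory', 'resolve_rate', 'avg_cost']])

From the command line the same split is driven by whether input_path points at a file or a directory, as the __main__ block in the diff shows.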

View File

@@ -1,8 +1,12 @@
#!/usr/bin/env python3
import argparse
import glob
import json
import os
from collections import Counter
import pandas as pd
from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events
@@ -10,25 +14,21 @@ ERROR_KEYWORDS = [
'Agent encountered an error while processing the last action',
'APIError',
'Action execution failed',
'litellm.Timeout: APITimeoutError',
]
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('output_file', type=str, help='The file to summarize')
args = parser.parse_args()
with open(args.output_file, 'r') as file:
def process_file(file_path):
with open(file_path, 'r') as file:
lines = file.readlines()
num_lines = len(lines)
num_error_lines = 0
num_agent_stuck_in_loop = 0
num_resolved = 0
num_empty_patch = 0
num_unfinished_runs = 0
error_counter = Counter()
main_agent_cost = []
editor_cost = []
num_turns = []
@@ -36,6 +36,11 @@ if __name__ == '__main__':
for line in lines:
_d = json.loads(line)
if 'metrics' not in _d or _d['metrics'] is None:
# no metrics recorded: count as an unfinished run
num_unfinished_runs += 1
continue
# Cost
costs = _d['metrics'].get('costs', [])
_cur_main_agent_cost = 0
@@ -89,30 +94,180 @@ if __name__ == '__main__':
num_error_lines += 1
break
# print the error counter (with percentage)
print(
f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
)
print(
f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
)
print(
f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
)
print(
f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
)
assert len(num_turns) == num_lines
assert len(main_agent_cost) == num_lines
assert len(editor_cost) == num_lines
print('## Statistics')
print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
print(
f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
return {
'file_path': file_path,
'total_instances': num_lines,
'resolved': {
'count': num_resolved,
'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
},
'empty_patches': {
'count': num_empty_patch,
'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
},
'unfinished_runs': {
'count': num_unfinished_runs,
'percentage': (num_unfinished_runs / num_lines * 100)
if num_lines > 0
else 0,
},
'errors': {
'total': num_error_lines,
'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
'stuck_in_loop': {
'count': num_agent_stuck_in_loop,
'percentage': (num_agent_stuck_in_loop / num_lines * 100)
if num_lines > 0
else 0,
},
'breakdown': {
str(error): {
'count': count,
'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
}
for error, count in error_counter.items()
},
},
'statistics': {
'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
'costs': {
'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
if num_lines > 0
else 0,
},
},
}
def aggregate_directory(input_path) -> pd.DataFrame:
# Process all output.jsonl files in subdirectories
pattern = os.path.join(input_path, '**/output.jsonl')
files = glob.glob(pattern, recursive=True)
print(f'Processing {len(files)} files from directory {input_path}')
# Process each file silently and collect results
results = []
for file_path in files:
try:
result = process_file(file_path)
results.append(result)
except Exception as e:
print(f'Error processing {file_path}: {str(e)}')
import traceback
traceback.print_exc()
continue
# Convert results to pandas DataFrame and sort by resolve rate
df = pd.DataFrame(results)
# Extract directory name from file path
df['directory'] = df['file_path'].apply(
lambda x: os.path.basename(os.path.dirname(x))
)
print('## Detailed error breakdown:')
for error, count in error_counter.items():
print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
df = df.sort_values('resolve_rate', ascending=False)
return df
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'input_path', type=str, help='The file or directory to summarize'
)
parser.add_argument(
'--output',
type=str,
help='Output JSONL file for results',
default='summary_results.jsonl',
)
args = parser.parse_args()
if os.path.isdir(args.input_path):
df = aggregate_directory(args.input_path)
# Create the summary string
columns = [
'directory',
'resolve_rate',
'empty_patch_rate',
'unfinished_rate',
'error_rate',
'avg_turns',
'avg_cost',
'total_instances',
]
summary_str = df[columns].to_string(
float_format=lambda x: '{:.2f}'.format(x),
formatters={
'directory': lambda x: x[:90]
}, # Truncate directory names to 90 chars
index=False,
)
# Print to console
print('\nResults summary (sorted by resolve rate):')
print(summary_str)
# Save to text file
txt_output = args.output.rsplit('.', 1)[0] + '.txt'
with open(txt_output, 'w') as f:
f.write('Results summary (sorted by resolve rate):\n')
f.write(summary_str)
# Save aggregate results: full rows as JSONL, key columns as CSV
df.to_json(args.output, lines=True, orient='records')
df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
else:
# Process single file with detailed output
results = []
try:
result = process_file(args.input_path)
results.append(result)
# Print detailed results for single file
print(f'\nResults for {args.input_path}:')
print(
f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
)
print(
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
)
print(
f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
)
print(
f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
)
print(
f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
)
print('## Statistics')
print(
f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
)
print(
f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
)
print(
f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
)
print(
f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
)
print('## Detailed error breakdown:')
for error, data in result['errors']['breakdown'].items():
print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
except Exception as e:
print(f'Error processing {args.input_path}: {str(e)}')

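In directory mode the script also persists the aggregate table: the full rows go to the JSONL file given by --output (default summary_results.jsonl), and .txt and .csv variants are derived from the same basename. A small follow-up sketch, assuming the default output name, of reloading that JSONL with pandas to compare experiments:

import pandas as pd

# Reload the per-experiment summary rows written by summarize_outputs.py.
# 'summary_results.jsonl' is the default --output name from the diff above.
summary = pd.read_json('summary_results.jsonl', lines=True, orient='records')
top = summary.sort_values('resolve_rate', ascending=False).head(5)
print(top[['directory', 'resolve_rate', 'empty_patch_rate', 'avg_turns']])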
View File

@@ -431,7 +431,7 @@ def convert_fncall_messages_to_non_fncall_messages(
tool_content = convert_tool_call_to_string(message['tool_calls'][0])
except FunctionCallConversionError as e:
raise FunctionCallConversionError(
f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}'
f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}'
) from e
if isinstance(content, str):
content += '\n\n' + tool_content

View File

@@ -336,13 +336,13 @@ class RemoteRuntime(Runtime):
assert 'runtime_id' in runtime_data
assert runtime_data['runtime_id'] == self.runtime_id
assert 'pod_status' in runtime_data
pod_status = runtime_data['pod_status']
pod_status = runtime_data['pod_status'].lower()
self.log('debug', f'Pod status: {pod_status}')
# FIXME: We should fix this in the backend of the /start endpoint and make sure
# the pod is created before the response is returned.
# Retry for a period of time to give the cluster time to start the pod.
if pod_status == 'Ready':
if pod_status == 'ready':
try:
with self._send_request(
'GET',
@@ -358,14 +358,14 @@ class RemoteRuntime(Runtime):
)
return
elif (
pod_status == 'Not Found'
or pod_status == 'Pending'
or pod_status == 'Running'
pod_status == 'not found'
or pod_status == 'pending'
or pod_status == 'running'
): # nb: Running is not yet Ready
raise RuntimeNotReadyError(
f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
)
elif pod_status in ('Failed', 'Unknown'):
elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
# clean up the runtime
self.close()
raise RuntimeError(

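The RemoteRuntime hunk above lowercases the reported pod status before comparing, so Kubernetes casing variants ('Ready', 'Pending', 'CrashLoopBackOff', ...) match consistently, and it adds crashloopbackoff to the statuses treated as fatal rather than retried. A condensed, illustrative sketch of that branching (a standalone helper, not the actual class method):

def classify_pod_status(raw_status: str) -> str:
    """Map a raw pod status string to 'ready', 'retry', or 'fatal'."""
    status = raw_status.lower()
    if status == 'ready':
        return 'ready'  # runtime can accept requests
    if status in ('not found', 'pending', 'running'):
        return 'retry'  # Running is not yet Ready; keep polling
    if status in ('failed', 'unknown', 'crashloopbackoff'):
        return 'fatal'  # close the runtime and raise
    return 'retry'  # assumption: treat unexpected statuses as retryable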
View File

@@ -63,7 +63,7 @@ class SessionManager:
await self._process_message(message)
except asyncio.CancelledError:
return
except:
except Exception:
try:
asyncio.get_running_loop()
logger.warning(