mirror of
https://github.com/OpenHands/OpenHands.git
synced 2025-12-26 05:48:36 +08:00
Co-authored-by: Tim O'Farrell <tofarr@gmail.com> Co-authored-by: Engel Nyst <enyst@users.noreply.github.com> Co-authored-by: Robert Brennan <accounts@rbren.io> Co-authored-by: Graham Neubig <neubig@gmail.com>
120 lines
3.8 KiB
Python
Executable File
120 lines
3.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import argparse
|
|
import json
|
|
from collections import Counter
|
|
|
|
from openhands.events.serialization import event_from_dict
|
|
from openhands.events.utils import get_pairs_from_events
|
|
|
|
# Substrings searched for in the raw text of an output line when the record
# carries no string `error` field. Order matters: only the first keyword that
# matches a line is counted for that line.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]
|
|
|
|
def _split_costs(costs):
    """Sum one instance's cost entries into ``(main_agent_cost, editor_cost)``.

    Each entry is either a bare number (older output format, attributed to the
    main agent) or a mapping with ``'model'`` and ``'cost'`` keys. Entries
    whose model name contains ``'draft_editor'`` are attributed to the editor;
    every other entry is attributed to the main agent.
    """
    main_agent_total = 0
    editor_total = 0
    for entry in costs:
        if isinstance(entry, (int, float)):
            # backward compatible; JSON decodes whole numbers such as ``0``
            # to int, so ints must be accepted alongside floats.
            main_agent_total += entry
        elif 'draft_editor' in entry['model']:
            editor_total += entry['cost']
        else:
            main_agent_total += entry['cost']
    return main_agent_total, editor_total


def _percent(count, total):
    """Return ``count / total`` as a percentage, or 0.0 when ``total`` is 0."""
    return count / total * 100 if total else 0.0


def _mean(total, count):
    """Return ``total / count``, or 0.0 when ``count`` is 0."""
    return total / count if count else 0.0


def main():
    """Print aggregate statistics for an evaluation output file.

    The file named by the ``output_file`` command-line argument is read as one
    JSON object per line. For every record the script accumulates the cost
    entries under ``metrics.costs``, the number of pairs derived from
    ``history``, whether ``test_result.git_patch`` is empty, whether
    ``report.resolved`` is truthy, and the error it reports (if any).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing empty line) are not records and would
        # make json.loads fail, so they are dropped before counting.
        lines = [line for line in file if line.strip()]

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0

    num_resolved = 0
    num_empty_patch = 0

    error_counter = Counter()

    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        _d = json.loads(line)

        # Cost (a missing or null `metrics` / `costs` counts as no cost)
        costs = (_d.get('metrics') or {}).get('costs') or []
        _cur_main_agent_cost, _cur_editor_cost = _split_costs(costs)
        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)

        # Turn status
        history = _d.get('history') or []
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))

        # Patch & resolve status
        patch = (_d.get('test_result') or {}).get('git_patch', '')
        if patch == '':
            # Records with an empty patch are only counted here; their
            # resolve and error status is not inspected.
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        if report.get('resolved', False):
            num_resolved += 1

        # Error
        error = _d.get('error', None)

        if isinstance(error, str):
            if 'Agent got stuck in a loop' in error:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif error:
                error_counter[error] += 1
            # A string `error` field (even an empty one) takes precedence
            # over the keyword scan of the raw line below.
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    print('-' * 100)
    print(
        f'# of resolved: {num_resolved} / {num_lines} ({_percent(num_resolved, num_lines):.2f}%)'
    )
    print(
        f'# of empty patch: {num_empty_patch} / {num_lines} ({_percent(num_empty_patch, num_lines):.2f}%)'
    )
    print(
        f'# of error lines: {num_error_lines} / {num_lines} ({_percent(num_error_lines, num_lines):.2f}%)'
    )
    print(
        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({_percent(num_agent_stuck_in_loop, num_lines):.2f}%)'
    )
    assert len(num_turns) == num_lines
    assert len(main_agent_cost) == num_lines
    assert len(editor_cost) == num_lines
    print(f'Avg. num of turns per instance: {_mean(sum(num_turns), num_lines):.2f}')
    print(
        f'Avg. agent cost per instance: {_mean(sum(main_agent_cost), num_lines):.2f} USD'
    )
    print(
        f'Avg. editor cost per instance: {_mean(sum(editor_cost), num_lines):.2f} USD'
    )
    print(
        f'Avg. total cost per instance: {_mean(sum(main_agent_cost) + sum(editor_cost), num_lines):.2f} USD'
    )
    print('-' * 100)
    print('Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({_percent(count, num_lines):.2f}%)')
    print('-' * 100)


if __name__ == '__main__':
    main()
|