OpenHands/evaluation/benchmarks/testgeneval/scripts/eval/summarize_outputs.py

#!/usr/bin/env python3
import argparse
import json
from collections import Counter

from openhands.events.serialization import event_from_dict
from openhands.events.utils import get_pairs_from_events

ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args()

    with open(args.output_file, 'r') as file:
        lines = file.readlines()

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0

    coverage = 0
    mutation_score = 0
    num_empty_suite = 0

    error_counter = Counter()

    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        _d = json.loads(line)

        # Cost
        costs = _d['metrics'].get('costs', [])
        _cur_main_agent_cost = 0
        _cur_editor_cost = 0
        for cost in costs:
            if isinstance(cost, float):
                # backward compatible
                _cur_main_agent_cost += cost
            else:
                if 'draft_editor' in cost['model']:
                    _cur_editor_cost += cost['cost']
                else:
                    _cur_main_agent_cost += cost['cost']

        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)

        # Turn status
        history = _d.get('history', [])
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))

        # Suite & resolve status
        suite = _d.get('test_result', {}).get('test_suite', '')
        if suite == '':
            num_empty_suite += 1
            continue

        report = _d.get('report', {}) or {}
        coverage += report.get('coverage', 0)
        mutation_score += report.get('mutation_score', 0)

        # Error
        error = _d.get('error', None)

        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue

        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    print(f'Average coverage for {num_lines} ({coverage / num_lines * 100:.2f}%)')
    print(
        f'Average mutation score for {num_lines} ({mutation_score / num_lines * 100:.2f}%)'
    )

    print(
        f'Number of empty suite: {num_empty_suite} / {num_lines} ({num_empty_suite / num_lines * 100:.2f}%)'
    )
    print(
        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
    )
    print(
        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
    )
    assert len(num_turns) == num_lines
    assert len(main_agent_cost) == num_lines
    assert len(editor_cost) == num_lines
    print('## Statistics')
    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
    print(
        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
    )

    print('## Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')