[Evaluation] Add summarise_results script for TheAgentCompany benchmark (#5811)

2026-03-22 13:47:19 +08:00 · 2024-12-27 20:33:41 -08:00
parent 157ff4a4b9
commit 6a4442e590
1 changed files with 316 additions and 0 deletions
--- a/evaluation/benchmarks/the_agent_company/scripts/summarise_results.py
+++ b/evaluation/benchmarks/the_agent_company/scripts/summarise_results.py
@@ -0,0 +1,316 @@
+###########################################################################################################
+# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/summarise_results.py
+###########################################################################################################
+
+
+import glob
+import json
+import os
+import re
+import sys
+from typing import Dict, Tuple
+
+
+def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
+    """
+    Estimate the price of one LLM call from its token counts.
+
+    The model is recognised by a case-insensitive substring match against the
+    known model names below; checks run top to bottom and the first hit wins.
+
+    Args:
+        model: Model identifier of the call
+        prompt_tokens: Number of prompt (input) tokens
+        completion_tokens: Number of completion (output) tokens
+
+    Returns:
+        Estimated cost of the call
+
+    Raises:
+        ValueError: If the model matches none of the known names
+    """
+    name = model.lower()
+
+    if 'claude-3-5-sonnet' in name:
+        # https://www.anthropic.com/pricing#anthropic-api, accessed 12/11/2024
+        return 0.000003 * prompt_tokens + 0.000015 * completion_tokens
+
+    if 'gpt-4o' in name:
+        # https://openai.com/api/pricing/, accessed 12/11/2024
+        return 0.0000025 * prompt_tokens + 0.00001 * completion_tokens
+
+    if 'gemini-1.5-pro' in name:
+        # https://ai.google.dev/pricing#1_5pro, accessed 12/11/2024
+        # base rate applies up to 128k prompt tokens; above that the cost is doubled
+        long_prompt_factor = 2 if prompt_tokens > 128000 else 1
+        return (
+            0.00000125 * prompt_tokens + 0.000005 * completion_tokens
+        ) * long_prompt_factor
+
+    if 'gemini-2.0-flash-exp' in name:
+        # price unknown for gemini-2.0-flash-exp, assuming same price as gemini-1.5-flash
+        long_prompt_factor = 2 if prompt_tokens > 128000 else 1
+        return (
+            0.000000075 * prompt_tokens + 0.0000003 * completion_tokens
+        ) * long_prompt_factor
+
+    if 'qwen2-72b' in name:
+        # assuming hosted on Together
+        # https://www.together.ai/pricing, accessed 12/11/2024
+        return 0.0000009 * (prompt_tokens + completion_tokens)
+
+    if 'qwen2p5-72b' in name:
+        # assuming hosted on Together
+        # https://www.together.ai/pricing, accessed 12/14/2024
+        return 0.0000012 * (prompt_tokens + completion_tokens)
+
+    if 'llama-v3p1-405b-instruct' in name:
+        # assuming hosted on Fireworks AI
+        # https://fireworks.ai/pricing, accessed 12/11/2024
+        return 0.000003 * (prompt_tokens + completion_tokens)
+
+    if 'llama-v3p1-70b-instruct' in name or 'llama-v3p3-70b-instruct' in name:
+        # assuming hosted on Fireworks AI; both 70b variants use the same flat rate
+        return 0.0000009 * (prompt_tokens + completion_tokens)
+
+    if 'amazon.nova-pro-v1:0' in name:
+        # assuming hosted on Amazon Bedrock
+        # https://aws.amazon.com/bedrock/pricing/, accessed 12/11/2024
+        return 0.0000008 * prompt_tokens + 0.0000032 * completion_tokens
+
+    raise ValueError(f'Unknown model: {model}')
+
+
+def analyze_eval_json_file(filepath: str) -> Tuple[int, int]:
+    """
+    Analyze a single eval JSON file and extract the total and result from final_score.
+
+    Reading is best-effort: any failure (missing file, invalid JSON, unexpected
+    structure) is reported on stdout and yields (0, 0) instead of raising.
+
+    Args:
+        filepath: Path to the JSON file
+
+    Returns:
+        Tuple containing (total, result) from final_score; each value falls
+        back to 0 when its key is absent, and (0, 0) is returned on any error
+    """
+    try:
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # platform's locale default encoding.
+        with open(filepath, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        final_score = data.get('final_score', {})
+        return (final_score.get('total', 0), final_score.get('result', 0))
+    except json.JSONDecodeError as e:
+        print(f'Error decoding JSON in {filepath}: {e}')
+        return (0, 0)
+    except Exception as e:
+        # Deliberately broad: one unreadable or oddly shaped file must not
+        # abort the whole report.
+        print(f'Error processing {filepath}: {e}')
+        return (0, 0)
+
+
+def analyze_traj_json_file(filepath: str) -> Tuple[int, float]:
+    """
+    Analyze a single trajectory JSON file and extract the steps and tokens
+    for each step. Then estimate the cost based on the tokens and the model type.
+    Note: this is assuming there's no prompt caching at all.
+
+    Args:
+        filepath: Path to the trajectory JSON file (an iterable of action entries)
+
+    Returns:
+        Tuple containing (steps, cost): the number of distinct LLM responses
+        found and the summed estimated cost of those responses
+    """
+    steps: int = 0
+    cost: float = 0.0
+
+    # Decode explicitly as UTF-8 so the result does not depend on the locale
+    # default; the file is closed before the entries are processed.
+    with open(filepath, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    response_id = None
+    for action in data:
+        if 'tool_call_metadata' not in action:
+            continue
+
+        model_response = action['tool_call_metadata']['model_response']
+        if model_response['id'] == response_id:
+            # openhands displays the same model response meta data multiple times, when
+            # a single LLM call leads to multiple actions and observations.
+            continue
+        response_id = model_response['id']
+
+        steps += 1
+        usage = model_response['usage']
+        model: str = model_response['model']
+        cost += calculate_cost(
+            model, usage['prompt_tokens'], usage['completion_tokens']
+        )
+
+    return (steps, cost)
+
+
+def analyze_folder(
+    folder_path: str,
+) -> Tuple[Dict[str, Tuple[int, int]], Dict[str, Tuple[int, float]]]:
+    """
+    Analyze all eval_*.json & traj_*.json files in the specified folder.
+
+    Args:
+        folder_path: Path to the folder containing JSON files
+
+    Returns:
+        dictionaries:
+        - eval_results: Dictionary keyed by the part of the filename between
+          'eval_' and '.json', with a (total, result) tuple as value
+        - traj_results: Dictionary keyed by the part of the filename between
+          'traj_' and '.json', with a (steps, cost) tuple as value
+    """
+    eval_results = {}
+    traj_results = {}
+
+    # Escape the folder so glob metacharacters in its name ('[', '*', '?')
+    # are matched literally; only the filename part is a pattern.
+    literal_folder = glob.escape(folder_path)
+    eval_pattern = os.path.join(literal_folder, 'eval_*.json')
+    traj_pattern = os.path.join(literal_folder, 'traj_*.json')
+
+    for filepath in glob.glob(eval_pattern):
+        filename = os.path.basename(filepath)
+        match = re.search(r'eval_(.+)\.json', filename)
+        if match is None:
+            # e.g. 'eval_.json': matched by the glob but carries no task name
+            continue
+        total, result = analyze_eval_json_file(filepath)
+        eval_results[match.group(1)] = (total, result)
+
+    for filepath in glob.glob(traj_pattern):
+        filename = os.path.basename(filepath)
+        match = re.search(r'traj_(.+)\.json', filename)
+        if match is None:
+            # e.g. 'traj_.json': matched by the glob but carries no task name
+            continue
+        steps, cost = analyze_traj_json_file(filepath)
+        traj_results[match.group(1)] = (steps, cost)
+
+    return eval_results, traj_results
+
+
+def get_task_nature_category(task_name: str) -> str:
+    """
+    Get the nature category of the task.
+
+    The category is the part of the task name before the first '-', compared
+    case-insensitively against the known categories.
+
+    Args:
+        task_name: Task name, e.g. 'sde-some-task'
+
+    Returns:
+        The lowercase category name ('sde', 'pm', 'ds', 'admin', 'hr' or
+        'finance'), or 'other' when the prefix is not a known category
+    """
+    task_nature = task_name.split('-')[0].lower()
+    # Return the normalised (lowercase) form: the check is case-insensitive,
+    # so the returned value must be too, otherwise 'SDE-x' would yield 'SDE'
+    # and never compare equal to the lowercase category 'sde'.
+    if task_nature in ('sde', 'pm', 'ds', 'admin', 'hr', 'finance'):
+        return task_nature
+    return 'other'
+
+
+def calculate_score(total: int, result: int) -> float:
+    """
+    Calculate the score as a number between 0 and 1.
+
+    Formula: score = (result / total) * 0.5 + (result // total) * 0.5
+    Explanation:
+    - (result / total) * 0.5: This is the completion ratio, scaled down to a 0-0.5 range.
+    - (result // total) * 0.5: This is a binary score indicating whether the task was completed or not
+      (0 or 1 as long as 0 <= result <= total).
+
+    Args:
+        total: Total possible points
+        result: Actual points achieved
+
+    Returns:
+        Score as a number between 0 and 1; 0.0 when total is not positive
+        (e.g. the (0, 0) placeholder produced for an unreadable eval file)
+    """
+    if total <= 0:
+        # Nothing could be earned, so there is no ratio to compute; avoid
+        # dividing by zero and score the task as not completed.
+        return 0.0
+    return (result / total * 0.5) + (result // total * 0.5)
+
+
+def is_perfect_completion(total: int, result: int) -> bool:
+    """
+    Tell whether a task earned every available point.
+
+    Args:
+        total: Total possible points
+        result: Actual points achieved
+
+    Returns:
+        True only if total is positive and result equals total, False otherwise
+    """
+    if not total > 0:
+        # a task with no available points never counts as perfect
+        return False
+    return result == total
+
+
+def _print_results_table(
+    detailed_results: list, traj_results: Dict[str, Tuple[int, float]]
+) -> None:
+    """Print the per-task markdown table, one row per evaluated task."""
+    print('\n## Results per File')
+    print('\n*Sorted by score (⭐ indicates perfect completion)*\n')
+
+    print(
+        '| Filename | Total | Result | Score | Steps | Cost (assuming no prompt caching)|'
+    )
+    print('|----------|--------|---------|-------|-------|------|')
+
+    for task_name, total, result, score, is_perfect, _ in detailed_results:
+        perfect_marker = ' ⭐' if is_perfect else ''
+        if task_name in traj_results:
+            steps, cost = traj_results[task_name]
+            steps_cell, cost_cell = f'{steps}', f'{cost:.2f}'
+        else:
+            # An eval file can exist without its trajectory file; show the
+            # row anyway instead of failing on the missing key.
+            steps_cell = cost_cell = 'N/A'
+        print(
+            f'| {task_name} | {total:,} | {result:,} | {score:.2f}{perfect_marker} | {steps_cell} | {cost_cell} |'
+        )
+
+
+def _print_summary(
+    detailed_results: list, traj_results: Dict[str, Tuple[int, float]]
+) -> None:
+    """Print task count, perfect completions, overall score and trajectory averages."""
+    num_tasks = len(detailed_results)
+    perfect_completions = sum(
+        1 for _, _, _, _, is_perfect, _ in detailed_results if is_perfect
+    )
+
+    print('\n## Summary\n')
+    print(f'**Tasks Evaluated:** {num_tasks}\n')
+    print(
+        f'**Perfect Completions:** {perfect_completions}/{num_tasks} ({(perfect_completions/num_tasks*100):.2f}%)\n'
+    )
+
+    overall_score = (
+        sum(score for _, _, _, score, _, _ in detailed_results) / num_tasks * 100
+    )
+    print(f'**Overall Score:** {overall_score:.2f}%\n')
+
+    if traj_results:
+        avg_steps = sum(steps for steps, _ in traj_results.values()) / len(
+            traj_results
+        )
+        avg_cost = sum(cost for _, cost in traj_results.values()) / len(traj_results)
+        print(f'**Average Steps:** {avg_steps:.2f}\n')
+        print(f'**Average Cost (USD):** {avg_cost:.2f}\n')
+    else:
+        # No trajectory files at all: there is nothing to average.
+        print('**Average Steps:** N/A\n')
+        print('**Average Cost (USD):** N/A\n')
+
+
+def _print_statistics(detailed_results: list) -> None:
+    """Print score statistics over all tasks and per nature category."""
+    scores = [score for _, _, _, score, _, _ in detailed_results]
+    ordered = sorted(scores)
+    mid = len(ordered) // 2
+    if len(ordered) % 2:
+        median_score = ordered[mid]
+    else:
+        # Even count: the median is the mean of the two middle values.
+        median_score = (ordered[mid - 1] + ordered[mid]) / 2
+    avg_score = sum(scores) / len(scores)
+
+    print('\n## Statistics\n')
+    print('| Metric | Value |')
+    print('|---------|--------|')
+    print(f'| Highest Task Score | {max(scores)*100:.2f}% |')
+    print(f'| Lowest Task Score | {min(scores)*100:.2f}% |')
+    print(f'| Median Task Score | {median_score*100:.2f}% |')
+    print(f'| Average Task Score | {avg_score*100:.2f}% |')
+
+    # compute avg score per nature category
+    print('\n## Statistics per Nature Category\n')
+    print('| Metric | Value |')
+    print('|---------|--------|')
+    for task_nature in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance', 'other']:
+        category_rows = [row for row in detailed_results if row[5] == task_nature]
+        num_of_tasks = len(category_rows)
+        if num_of_tasks == 0:
+            # No task of this nature was evaluated; skip the rows rather
+            # than divide by zero.
+            continue
+        task_nature_score = sum(row[3] for row in category_rows) / num_of_tasks
+        perfect_in_category = sum(1 for row in category_rows if row[4])
+        print(
+            f'| Perfect Completions for {task_nature} | {perfect_in_category}/{num_of_tasks} ({perfect_in_category/num_of_tasks*100:.2f}%) |'
+        )
+        print(f'| Average Score for {task_nature} | {task_nature_score*100:.2f}% |')
+
+
+def main():
+    """
+    Print a markdown report for the results folder given as the only CLI argument.
+
+    Exits with status 1 when the argument count is wrong or the path is not a
+    directory; returns without a report when the folder has no eval_*.json files.
+    """
+    if len(sys.argv) != 2:
+        print('Usage: poetry run python summarise_results.py <folder_path>')
+        sys.exit(1)
+
+    folder_path = sys.argv[1]
+
+    if not os.path.isdir(folder_path):
+        print(f"Error: '{folder_path}' is not a valid directory")
+        sys.exit(1)
+
+    eval_results, traj_results = analyze_folder(folder_path)
+
+    if not eval_results:
+        print(f'No eval_*.json files found in {folder_path}')
+        return
+
+    # One row per task: (name, total, result, score, is_perfect, nature category)
+    detailed_results = [
+        (
+            task_name,
+            total,
+            result,
+            calculate_score(total, result),
+            is_perfect_completion(total, result),
+            get_task_nature_category(task_name),
+        )
+        for task_name, (total, result) in eval_results.items()
+    ]
+
+    # Sort by score in descending order, ties broken by task name
+    detailed_results.sort(key=lambda x: (-x[3], x[0]))
+
+    print('\n# Evaluation Results Report')
+    _print_results_table(detailed_results, traj_results)
+    _print_summary(detailed_results, traj_results)
+    _print_statistics(detailed_results)
+
+
+# Run the report only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()