From b082ccc0fbe6e2b11507e23561d70aebb2b2d0d3 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Tue, 26 Aug 2025 23:03:35 -0400
Subject: [PATCH] feat(llm): add support for deepseek and gpt-5-mini, util for
 token count (#10626)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../swe_bench/scripts/rollout_swegym.sh       |   1 +
 .../utils/scripts/aggregate_token_usage.py    | 209 ++++++++++++++++++
 openhands/llm/model_features.py               |   5 +-
 3 files changed, 212 insertions(+), 3 deletions(-)
 create mode 100755 evaluation/utils/scripts/aggregate_token_usage.py
diff --git a/evaluation/benchmarks/swe_bench/scripts/rollout_swegym.sh b/evaluation/benchmarks/swe_bench/scripts/rollout_swegym.sh
index 63e5609a76..ae9ea9c8e4 100755
--- a/evaluation/benchmarks/swe_bench/scripts/rollout_swegym.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/rollout_swegym.sh
@@ -13,6 +13,7 @@ N_RUNS=${4:-1}
 export EXP_NAME=$EXP_NAME
 # use 2x resources for rollout since some codebases are pretty resource-intensive
 export DEFAULT_RUNTIME_RESOURCE_FACTOR=2
+export ITERATIVE_EVAL_MODE=false
 echo "MODEL: $MODEL"
 echo "EXP_NAME: $EXP_NAME"
 DATASET="SWE-Gym/SWE-Gym"  # change this to the "/SWE-Gym-Lite" if you want to rollout the lite subset
diff --git a/evaluation/utils/scripts/aggregate_token_usage.py b/evaluation/utils/scripts/aggregate_token_usage.py
new file mode 100755
index 0000000000..db27768019
--- /dev/null
+++ b/evaluation/utils/scripts/aggregate_token_usage.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""
+Script to aggregate token usage metrics from LLM completion files.
+
+Usage:
+    python aggregate_token_usage.py <directory_path> [--input-cost <cost>] [--output-cost <cost>] [--cached-cost <cost>]
+
+Arguments:
+    directory_path: Path to the directory containing completion files
+    --input-cost: Cost per input token (default: 0.0)
+    --output-cost: Cost per output token (default: 0.0)
+    --cached-cost: Cost per cached token (default: 0.0)
+"""
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+
+def aggregate_token_usage(
+    directory_path, input_cost=0.0, output_cost=0.0, cached_cost=0.0
+):
+    """
+    Aggregate token usage metrics from all JSON completion files in the directory.
+
+    Args:
+        directory_path (str): Path to directory containing completion files
+        input_cost (float): Cost per input token
+        output_cost (float): Cost per output token
+        cached_cost (float): Cost per cached token
+    """
+
+    # Initialize counters
+    totals = {
+        'input_tokens': 0,
+        'output_tokens': 0,
+        'cached_tokens': 0,
+        'total_tokens': 0,
+        'files_processed': 0,
+        'files_with_errors': 0,
+        'cost': 0,
+    }
+
+    # Find all JSON files recursively
+    json_files = list(Path(directory_path).rglob('*.json'))
+
+    print(f'Found {len(json_files)} JSON files to process...')
+
+    for json_file in json_files:
+        try:
+            with open(json_file, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+
+            # Look for usage data in response or fncall_response
+            usage_data = None
+            if (
+                'response' in data
+                and isinstance(data['response'], dict)
+                and 'usage' in data['response']
+            ):
+                usage_data = data['response']['usage']
+            elif (
+                'fncall_response' in data
+                and isinstance(data['fncall_response'], dict)
+                and 'usage' in data['fncall_response']
+            ):
+                usage_data = data['fncall_response']['usage']
+
+            if usage_data:
+                # Extract token counts
+                completion_tokens = usage_data.get('completion_tokens', 0)
+                prompt_tokens = usage_data.get('prompt_tokens', 0)
+                cached_tokens = usage_data.get('cached_tokens', 0)
+
+                # Handle cases where cached_tokens might be in prompt_tokens_details
+                if cached_tokens == 0 and 'prompt_tokens_details' in usage_data:
+                    details = usage_data['prompt_tokens_details']
+                    if isinstance(details, dict) and 'cached_tokens' in details:
+                        cached_tokens = details.get('cached_tokens', 0) or 0
+
+                # Calculate non-cached input tokens
+                non_cached_input = prompt_tokens - cached_tokens
+
+                # Update totals
+                totals['input_tokens'] += non_cached_input
+                totals['output_tokens'] += completion_tokens
+                totals['cached_tokens'] += cached_tokens
+                totals['total_tokens'] += prompt_tokens + completion_tokens
+
+            if 'cost' in data:
+                totals['cost'] += data['cost']
+            totals['files_processed'] += 1
+
+            # Progress indicator
+            if totals['files_processed'] % 1000 == 0:
+                print(f'Processed {totals["files_processed"]} files...')
+
+        except Exception as e:
+            totals['files_with_errors'] += 1
+            if totals['files_with_errors'] <= 5:  # Only show first 5 errors
+                print(f'Error processing {json_file}: {e}')
+
+    # Calculate costs
+    input_cost_total = totals['input_tokens'] * input_cost
+    output_cost_total = totals['output_tokens'] * output_cost
+    cached_cost_total = totals['cached_tokens'] * cached_cost
+    total_cost = input_cost_total + output_cost_total + cached_cost_total
+
+    # Print results
+    print('\n' + '=' * 60)
+    print('TOKEN USAGE AGGREGATION RESULTS')
+    print('=' * 60)
+    print(f'Files processed: {totals["files_processed"]:,}')
+    print(f'Files with errors: {totals["files_with_errors"]:,}')
+    print()
+    print('TOKEN COUNTS:')
+    print(f'  Input tokens (non-cached):             {totals["input_tokens"]:,}')
+    print(f'  Output tokens:                         {totals["output_tokens"]:,}')
+    print(f'  Cached tokens:                         {totals["cached_tokens"]:,}')
+    print(f'  Total tokens:                          {totals["total_tokens"]:,}')
+    print(f'  Total costs (based on returned value): ${totals["cost"]:.6f}')
+    print()
+
+    if input_cost > 0 or output_cost > 0 or cached_cost > 0:
+        print('COST CALCULATED BASED ON PROVIDED RATE:')
+        print(
+            f'  Input cost:   ${input_cost_total:.6f} ({totals["input_tokens"]:,} × ${input_cost:.6f})'
+        )
+        print(
+            f'  Output cost:  ${output_cost_total:.6f} ({totals["output_tokens"]:,} × ${output_cost:.6f})'
+        )
+        print(
+            f'  Cached cost:  ${cached_cost_total:.6f} ({totals["cached_tokens"]:,} × ${cached_cost:.6f})'
+        )
+        print(f'  Total cost:   ${total_cost:.6f}')
+        print()
+
+    print('SUMMARY:')
+    print(
+        f'  Total input tokens:  {totals["input_tokens"] + totals["cached_tokens"]:,}'
+    )
+    print(f'  Total output tokens: {totals["output_tokens"]:,}')
+    print(f'  Grand total tokens:  {totals["total_tokens"]:,}')
+
+    return totals
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Aggregate token usage metrics from LLM completion files',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python aggregate_token_usage.py /path/to/completions
+  python aggregate_token_usage.py /path/to/completions --input-cost 0.000001 --output-cost 0.000002
+  python aggregate_token_usage.py /path/to/completions --input-cost 0.000001 --output-cost 0.000002 --cached-cost 0.0000005
+        """,
+    )
+
+    parser.add_argument(
+        'directory_path', help='Path to directory containing completion files'
+    )
+
+    parser.add_argument(
+        '--input-cost',
+        type=float,
+        default=0.0,
+        help='Cost per input token (default: 0.0)',
+    )
+
+    parser.add_argument(
+        '--output-cost',
+        type=float,
+        default=0.0,
+        help='Cost per output token (default: 0.0)',
+    )
+
+    parser.add_argument(
+        '--cached-cost',
+        type=float,
+        default=0.0,
+        help='Cost per cached token (default: 0.0)',
+    )
+
+    args = parser.parse_args()
+
+    # Validate directory path
+    if not os.path.exists(args.directory_path):
+        print(f"Error: Directory '{args.directory_path}' does not exist.")
+        return 1
+
+    if not os.path.isdir(args.directory_path):
+        print(f"Error: '{args.directory_path}' is not a directory.")
+        return 1
+
+    # Run aggregation
+    try:
+        aggregate_token_usage(
+            args.directory_path, args.input_cost, args.output_cost, args.cached_cost
+        )
+        return 0
+    except Exception as e:
+        print(f'Error during aggregation: {e}')
+        return 1
+
+
+if __name__ == '__main__':
+    exit(main())
diff --git a/openhands/llm/model_features.py b/openhands/llm/model_features.py
index 1234e7a3c3..6f5d1b349c 100644
--- a/openhands/llm/model_features.py
+++ b/openhands/llm/model_features.py
@@ -84,6 +84,7 @@ FUNCTION_CALLING_PATTERNS: list[str] = [
     'kimi-k2-instruct',
     'qwen3-coder*',
     'qwen3-coder-480b-a35b-instruct',
+    'deepseek-chat',
 ]
 
 REASONING_EFFORT_PATTERNS: list[str] = [
@@ -98,9 +99,7 @@ REASONING_EFFORT_PATTERNS: list[str] = [
     'o4-mini-2025-04-16',
     'gemini-2.5-flash',
     'gemini-2.5-pro',
-    'gpt-5',
-    'gpt-5-2025-08-07',
-    'gpt-5-mini-2025-08-07',
+    'gpt-5*',
     # DeepSeek reasoning family
     'deepseek-r1-0528*',
 ]