diff --git a/frontend/src/components/features/conversation-panel/conversation-card.tsx b/frontend/src/components/features/conversation-panel/conversation-card.tsx
index 09ce722fad..540dbe8861 100644
--- a/frontend/src/components/features/conversation-panel/conversation-card.tsx
+++ b/frontend/src/components/features/conversation-panel/conversation-card.tsx
@@ -240,22 +240,67 @@ export function ConversationCard({
title="Metrics Information"
testID="metrics-modal"
>
-          {metrics?.cost !== null && (
-            <p>Total Cost: ${metrics.cost.toFixed(4)}</p>
-          )}
-          {metrics?.usage !== null && (
-            <>
-              <p>Tokens Used:</p>
-              <ul>
-                <li>Input: {metrics.usage.prompt_tokens}</li>
-                <li>Output: {metrics.usage.completion_tokens}</li>
-                <li>Total: {metrics.usage.total_tokens}</li>
-              </ul>
-            </>
+          {(metrics?.cost !== null || metrics?.usage !== null) && (
+            <div>
+              {metrics?.cost !== null && (
+                <div>
+                  <span>Total Cost (USD):</span>
+                  <span>${metrics.cost.toFixed(4)}</span>
+                </div>
+              )}
+
+              {metrics?.usage !== null && (
+                <>
+                  <div>
+                    <span>Total Input Tokens:</span>
+                    <span>{metrics.usage.prompt_tokens.toLocaleString()}</span>
+                  </div>
+
+                  <div>
+                    <span>Cache Hit:</span>
+                    <span>{metrics.usage.cache_read_tokens.toLocaleString()}</span>
+                    <span>Cache Write:</span>
+                    <span>{metrics.usage.cache_write_tokens.toLocaleString()}</span>
+                  </div>
+
+                  <div>
+                    <span>Total Output Tokens:</span>
+                    <span>{metrics.usage.completion_tokens.toLocaleString()}</span>
+                  </div>
+
+                  <div>
+                    <span>Total Tokens:</span>
+                    <span>
+                      {(
+                        metrics.usage.prompt_tokens +
+                        metrics.usage.completion_tokens
+                      ).toLocaleString()}
+                    </span>
+                  </div>
+                </>
+              )}
+            </div>
           )}
+
           {!metrics?.cost && !metrics?.usage && (
-            <p>No metrics data available</p>
+            <div>
+              <p>No metrics data available</p>
+            </div>
           )}
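
Every figure the modal renders is derived from the accumulated usage object; a minimal sketch of that arithmetic (plain Python, values illustrative, dict keys taken from `Metrics.get()` below):

```python
# Figures shown in the metrics modal, computed from the accumulated usage dict.
usage = {
    "prompt_tokens": 18,
    "completion_tokens": 11,
    "cache_read_tokens": 5,
    "cache_write_tokens": 7,
}

total_input = usage["prompt_tokens"]        # "Total Input Tokens"
cache_hit = usage["cache_read_tokens"]      # "Cache Hit"
cache_write = usage["cache_write_tokens"]   # "Cache Write"
total_output = usage["completion_tokens"]   # "Total Output Tokens"
total = usage["prompt_tokens"] + usage["completion_tokens"]  # "Total Tokens"

print(f"Total Tokens: {total:,}")  # grouped digits, like toLocaleString() in the UI
```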
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts
index 1fd2c8b878..e9d6b8fd0b 100644
--- a/frontend/src/services/actions.ts
+++ b/frontend/src/services/actions.ts
@@ -87,13 +87,10 @@ export function handleActionMessage(message: ActionMessage) {
}
// Update metrics if available
- if (
- message.llm_metrics ||
- message.tool_call_metadata?.model_response?.usage
- ) {
+ if (message.llm_metrics) {
const metrics = {
cost: message.llm_metrics?.accumulated_cost ?? null,
- usage: message.tool_call_metadata?.model_response?.usage ?? null,
+ usage: message.llm_metrics?.accumulated_token_usage ?? null,
};
store.dispatch(setMetrics(metrics));
}
diff --git a/frontend/src/state/metrics-slice.ts b/frontend/src/state/metrics-slice.ts
index dd5758e04b..52803072d0 100644
--- a/frontend/src/state/metrics-slice.ts
+++ b/frontend/src/state/metrics-slice.ts
@@ -5,7 +5,8 @@ interface MetricsState {
usage: {
prompt_tokens: number;
completion_tokens: number;
- total_tokens: number;
+ cache_read_tokens: number;
+ cache_write_tokens: number;
} | null;
}
diff --git a/frontend/src/types/message.tsx b/frontend/src/types/message.tsx
index 999840b0fc..763f292da3 100644
--- a/frontend/src/types/message.tsx
+++ b/frontend/src/types/message.tsx
@@ -19,6 +19,12 @@ export interface ActionMessage {
// LLM metrics information
llm_metrics?: {
accumulated_cost: number;
+ accumulated_token_usage: {
+ prompt_tokens: number;
+ completion_tokens: number;
+ cache_read_tokens: number;
+ cache_write_tokens: number;
+ };
};
// Tool call metadata
diff --git a/openhands/core/schema/action.py b/openhands/core/schema/action.py
index d330c39e49..ccd1c14ae1 100644
--- a/openhands/core/schema/action.py
+++ b/openhands/core/schema/action.py
@@ -42,7 +42,6 @@ class ActionType(str, Enum):
"""Delegates a task to another agent.
"""
-
THINK = 'think'
"""Logs a thought.
"""
diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
index 587a84a20b..5c314b0f80 100644
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -1,5 +1,5 @@
-import re
from typing import Any
+
from openhands.core.exceptions import LLMMalformedActionError
from openhands.events.action.action import Action
from openhands.events.action.agent import (
diff --git a/openhands/events/serialization/event.py b/openhands/events/serialization/event.py
index fdc91fbde6..3fcb0393aa 100644
--- a/openhands/events/serialization/event.py
+++ b/openhands/events/serialization/event.py
@@ -79,6 +79,11 @@ def event_from_dict(data: dict[str, Any]) -> 'Event':
metrics.token_usages = [
TokenUsage(**usage) for usage in value.get('token_usages', [])
]
+ # Set accumulated token usage if available
+ if 'accumulated_token_usage' in value:
+ metrics._accumulated_token_usage = TokenUsage(
+ **value.get('accumulated_token_usage', {})
+ )
value = metrics
setattr(evt, '_' + key, value)
return evt
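
A minimal sketch of the round trip this serializer change enables, using the `Metrics` and `TokenUsage` classes from the metrics.py hunk below (the direct attribute assignment stands in for what `event_from_dict` does internally):

```python
from openhands.llm.metrics import Metrics, TokenUsage

metrics = Metrics(model_name="gpt-4o")
metrics.add_token_usage(10, 5, 3, 2, "response-1")

payload = metrics.get()  # now includes 'accumulated_token_usage'

# event_from_dict rebuilds the accumulated usage from the serialized dict:
restored = Metrics(model_name="gpt-4o")
restored._accumulated_token_usage = TokenUsage(
    **payload.get("accumulated_token_usage", {})
)
assert restored._accumulated_token_usage.prompt_tokens == 10
assert restored._accumulated_token_usage.cache_write_tokens == 2
```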
diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index a5ec0efd75..d178fa1768 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -20,12 +20,23 @@ class ResponseLatency(BaseModel):
class TokenUsage(BaseModel):
"""Metric tracking detailed token usage per completion call."""
- model: str
- prompt_tokens: int
- completion_tokens: int
- cache_read_tokens: int
- cache_write_tokens: int
- response_id: str
+ model: str = Field(default='')
+ prompt_tokens: int = Field(default=0)
+ completion_tokens: int = Field(default=0)
+ cache_read_tokens: int = Field(default=0)
+ cache_write_tokens: int = Field(default=0)
+ response_id: str = Field(default='')
+
+ def __add__(self, other: 'TokenUsage') -> 'TokenUsage':
+ """Add two TokenUsage instances together."""
+ return TokenUsage(
+ model=self.model,
+ prompt_tokens=self.prompt_tokens + other.prompt_tokens,
+ completion_tokens=self.completion_tokens + other.completion_tokens,
+ cache_read_tokens=self.cache_read_tokens + other.cache_read_tokens,
+ cache_write_tokens=self.cache_write_tokens + other.cache_write_tokens,
+ response_id=self.response_id,
+ )
class Metrics:
@@ -42,6 +53,14 @@ class Metrics:
self._response_latencies: list[ResponseLatency] = []
self.model_name = model_name
self._token_usages: list[TokenUsage] = []
+ self._accumulated_token_usage: TokenUsage = TokenUsage(
+ model=model_name,
+ prompt_tokens=0,
+ completion_tokens=0,
+ cache_read_tokens=0,
+ cache_write_tokens=0,
+ response_id='',
+ )
@property
def accumulated_cost(self) -> float:
@@ -99,15 +118,24 @@ class Metrics:
response_id: str,
) -> None:
"""Add a single usage record."""
- self._token_usages.append(
- TokenUsage(
- model=self.model_name,
- prompt_tokens=prompt_tokens,
- completion_tokens=completion_tokens,
- cache_read_tokens=cache_read_tokens,
- cache_write_tokens=cache_write_tokens,
- response_id=response_id,
- )
+ usage = TokenUsage(
+ model=self.model_name,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ cache_read_tokens=cache_read_tokens,
+ cache_write_tokens=cache_write_tokens,
+ response_id=response_id,
+ )
+ self._token_usages.append(usage)
+
+ # Update accumulated token usage using the __add__ operator
+ self._accumulated_token_usage = self._accumulated_token_usage + TokenUsage(
+ model=self.model_name,
+ prompt_tokens=prompt_tokens,
+ completion_tokens=completion_tokens,
+ cache_read_tokens=cache_read_tokens,
+ cache_write_tokens=cache_write_tokens,
+ response_id='',
)
def merge(self, other: 'Metrics') -> None:
@@ -118,10 +146,16 @@ class Metrics:
self.token_usages += other.token_usages
self.response_latencies += other.response_latencies
+ # Merge accumulated token usage using the __add__ operator
+ self._accumulated_token_usage = (
+ self._accumulated_token_usage + other._accumulated_token_usage
+ )
+
def get(self) -> dict:
"""Return the metrics in a dictionary."""
return {
'accumulated_cost': self._accumulated_cost,
+ 'accumulated_token_usage': self._accumulated_token_usage.model_dump(),
'costs': [cost.model_dump() for cost in self._costs],
'response_latencies': [
latency.model_dump() for latency in self._response_latencies
@@ -134,6 +168,15 @@ class Metrics:
self._costs = []
self._response_latencies = []
self._token_usages = []
+ # Reset accumulated token usage with a new instance
+ self._accumulated_token_usage = TokenUsage(
+ model=self.model_name,
+ prompt_tokens=0,
+ completion_tokens=0,
+ cache_read_tokens=0,
+ cache_write_tokens=0,
+ response_id='',
+ )
def log(self):
"""Log the metrics."""
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 57906c2c77..87db3bc62e 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -13,7 +13,7 @@ from openhands.core.config import LLMConfig
from openhands.core.exceptions import OperationCancelled
from openhands.core.message import Message, TextContent
from openhands.llm.llm import LLM
-from openhands.llm.metrics import Metrics
+from openhands.llm.metrics import Metrics, TokenUsage
@pytest.fixture(autouse=True)
@@ -45,6 +45,84 @@ def test_llm_init_with_default_config(default_config):
assert llm.metrics.model_name == 'gpt-4o'
+def test_token_usage_add():
+ """Test that TokenUsage instances can be added together."""
+ # Create two TokenUsage instances
+ usage1 = TokenUsage(
+ model='model1',
+ prompt_tokens=10,
+ completion_tokens=5,
+ cache_read_tokens=3,
+ cache_write_tokens=2,
+ response_id='response-1',
+ )
+
+ usage2 = TokenUsage(
+ model='model2',
+ prompt_tokens=8,
+ completion_tokens=6,
+ cache_read_tokens=2,
+ cache_write_tokens=4,
+ response_id='response-2',
+ )
+
+ # Add them together
+ combined = usage1 + usage2
+
+ # Verify the result
+ assert combined.model == 'model1' # Should keep the model from the first instance
+ assert combined.prompt_tokens == 18 # 10 + 8
+ assert combined.completion_tokens == 11 # 5 + 6
+ assert combined.cache_read_tokens == 5 # 3 + 2
+ assert combined.cache_write_tokens == 6 # 2 + 4
+ assert (
+ combined.response_id == 'response-1'
+ ) # Should keep the response_id from the first instance
+
+
+def test_metrics_merge_accumulated_token_usage():
+ """Test that accumulated token usage is properly merged between two Metrics instances."""
+ # Create two Metrics instances
+ metrics1 = Metrics(model_name='model1')
+ metrics2 = Metrics(model_name='model2')
+
+ # Add token usage to each
+ metrics1.add_token_usage(10, 5, 3, 2, 'response-1')
+ metrics2.add_token_usage(8, 6, 2, 4, 'response-2')
+
+ # Verify initial accumulated token usage
+ metrics1_data = metrics1.get()
+ accumulated1 = metrics1_data['accumulated_token_usage']
+ assert accumulated1['prompt_tokens'] == 10
+ assert accumulated1['completion_tokens'] == 5
+ assert accumulated1['cache_read_tokens'] == 3
+ assert accumulated1['cache_write_tokens'] == 2
+
+ metrics2_data = metrics2.get()
+ accumulated2 = metrics2_data['accumulated_token_usage']
+ assert accumulated2['prompt_tokens'] == 8
+ assert accumulated2['completion_tokens'] == 6
+ assert accumulated2['cache_read_tokens'] == 2
+ assert accumulated2['cache_write_tokens'] == 4
+
+ # Merge metrics2 into metrics1
+ metrics1.merge(metrics2)
+
+ # Verify merged accumulated token usage
+ merged_data = metrics1.get()
+ merged_accumulated = merged_data['accumulated_token_usage']
+ assert merged_accumulated['prompt_tokens'] == 18 # 10 + 8
+ assert merged_accumulated['completion_tokens'] == 11 # 5 + 6
+ assert merged_accumulated['cache_read_tokens'] == 5 # 3 + 2
+ assert merged_accumulated['cache_write_tokens'] == 6 # 2 + 4
+
+ # Verify individual token usage records are maintained
+ token_usages = merged_data['token_usages']
+ assert len(token_usages) == 2
+ assert token_usages[0]['response_id'] == 'response-1'
+ assert token_usages[1]['response_id'] == 'response-2'
+
+
@patch('openhands.llm.llm.litellm.get_model_info')
def test_llm_init_with_model_info(mock_get_model_info, default_config):
mock_get_model_info.return_value = {
@@ -140,12 +218,22 @@ def test_llm_reset():
initial_metrics = copy.deepcopy(llm.metrics)
initial_metrics.add_cost(1.0)
initial_metrics.add_response_latency(0.5, 'test-id')
+ initial_metrics.add_token_usage(10, 5, 3, 2, 'test-id')
llm.reset()
assert llm.metrics.accumulated_cost != initial_metrics.accumulated_cost
assert llm.metrics.costs != initial_metrics.costs
assert llm.metrics.response_latencies != initial_metrics.response_latencies
+ assert llm.metrics.token_usages != initial_metrics.token_usages
assert isinstance(llm.metrics, Metrics)
+ # Check that accumulated token usage is reset
+ metrics_data = llm.metrics.get()
+ accumulated_usage = metrics_data['accumulated_token_usage']
+ assert accumulated_usage['prompt_tokens'] == 0
+ assert accumulated_usage['completion_tokens'] == 0
+ assert accumulated_usage['cache_read_tokens'] == 0
+ assert accumulated_usage['cache_write_tokens'] == 0
+
@patch('openhands.llm.llm.litellm.get_model_info')
def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
@@ -493,6 +581,82 @@ def test_llm_token_usage(mock_litellm_completion, default_config):
assert usage_entry_2['response_id'] == 'test-response-usage-2'
+@patch('openhands.llm.llm.litellm_completion')
+def test_accumulated_token_usage(mock_litellm_completion, default_config):
+ """Test that token usage is properly accumulated across multiple LLM calls."""
+ # Mock responses with token usage information
+ mock_response_1 = {
+ 'id': 'test-response-1',
+ 'choices': [{'message': {'content': 'First response'}}],
+ 'usage': {
+ 'prompt_tokens': 10,
+ 'completion_tokens': 5,
+ 'prompt_tokens_details': PromptTokensDetails(cached_tokens=3),
+ 'model_extra': {'cache_creation_input_tokens': 4},
+ },
+ }
+
+ mock_response_2 = {
+ 'id': 'test-response-2',
+ 'choices': [{'message': {'content': 'Second response'}}],
+ 'usage': {
+ 'prompt_tokens': 8,
+ 'completion_tokens': 6,
+ 'prompt_tokens_details': PromptTokensDetails(cached_tokens=2),
+ 'model_extra': {'cache_creation_input_tokens': 3},
+ },
+ }
+
+ # Set up the mock to return these responses in sequence
+ mock_litellm_completion.side_effect = [mock_response_1, mock_response_2]
+
+ # Create LLM instance
+ llm = LLM(config=default_config)
+
+ # First call
+ llm.completion(messages=[{'role': 'user', 'content': 'First message'}])
+
+ # Check accumulated token usage after first call
+ metrics_data = llm.metrics.get()
+ accumulated_usage = metrics_data['accumulated_token_usage']
+
+ assert accumulated_usage['prompt_tokens'] == 10
+ assert accumulated_usage['completion_tokens'] == 5
+ assert accumulated_usage['cache_read_tokens'] == 3
+ assert accumulated_usage['cache_write_tokens'] == 4
+
+ # Second call
+ llm.completion(messages=[{'role': 'user', 'content': 'Second message'}])
+
+ # Check accumulated token usage after second call
+ metrics_data = llm.metrics.get()
+ accumulated_usage = metrics_data['accumulated_token_usage']
+
+ # Values should be the sum of both calls
+ assert accumulated_usage['prompt_tokens'] == 18 # 10 + 8
+ assert accumulated_usage['completion_tokens'] == 11 # 5 + 6
+ assert accumulated_usage['cache_read_tokens'] == 5 # 3 + 2
+ assert accumulated_usage['cache_write_tokens'] == 7 # 4 + 3
+
+ # Verify individual token usage records are still maintained
+ token_usages = metrics_data['token_usages']
+ assert len(token_usages) == 2
+
+ # First record
+ assert token_usages[0]['prompt_tokens'] == 10
+ assert token_usages[0]['completion_tokens'] == 5
+ assert token_usages[0]['cache_read_tokens'] == 3
+ assert token_usages[0]['cache_write_tokens'] == 4
+ assert token_usages[0]['response_id'] == 'test-response-1'
+
+ # Second record
+ assert token_usages[1]['prompt_tokens'] == 8
+ assert token_usages[1]['completion_tokens'] == 6
+ assert token_usages[1]['cache_read_tokens'] == 2
+ assert token_usages[1]['cache_write_tokens'] == 3
+ assert token_usages[1]['response_id'] == 'test-response-2'
+
+
@patch('openhands.llm.llm.litellm_completion')
def test_completion_with_log_completions(mock_litellm_completion, default_config):
with tempfile.TemporaryDirectory() as temp_dir: