diff --git a/frontend/src/components/features/conversation-panel/conversation-card.tsx b/frontend/src/components/features/conversation-panel/conversation-card.tsx
index 09ce722fad..540dbe8861 100644
--- a/frontend/src/components/features/conversation-panel/conversation-card.tsx
+++ b/frontend/src/components/features/conversation-panel/conversation-card.tsx
@@ -240,22 +240,67 @@ export function ConversationCard({
           title="Metrics Information"
           testID="metrics-modal"
         >
-          {metrics?.cost !== null && (
-            <div>
-              Total Cost: ${metrics.cost.toFixed(4)}
-            </div>
-          )}
-          {metrics?.usage !== null && (
-            <>
-              <div>
-                Tokens Used:
-              </div>
-            </>
-          )}
+          <div>
+            {(metrics?.cost !== null || metrics?.usage !== null) && (
+              <div>
+                <div>
+                  {metrics?.cost !== null && (
+                    <div>
+                      <span>Total Cost (USD):</span>
+                      <span>${metrics.cost.toFixed(4)}</span>
+                    </div>
+                  )}
+
+                  {metrics?.usage !== null && (
+                    <>
+                      <div>
+                        <span>Total Input Tokens:</span>
+                        <span>
+                          {metrics.usage.prompt_tokens.toLocaleString()}
+                        </span>
+                      </div>
+
+                      <div>
+                        <span>Cache Hit:</span>
+                        <span>
+                          {metrics.usage.cache_read_tokens.toLocaleString()}
+                        </span>
+                        <span>Cache Write:</span>
+                        <span>
+                          {metrics.usage.cache_write_tokens.toLocaleString()}
+                        </span>
+                      </div>
+
+                      <div>
+                        <span>Total Output Tokens:</span>
+                        <span>
+                          {metrics.usage.completion_tokens.toLocaleString()}
+                        </span>
+                      </div>
+
+                      <div>
+                        <span>Total Tokens:</span>
+                        <span>
+                          {(
+                            metrics.usage.prompt_tokens +
+                            metrics.usage.completion_tokens
+                          ).toLocaleString()}
+                        </span>
+                      </div>
+                    </>
+                  )}
+                </div>
+              </div>
+            )}
+          </div>
+
           {!metrics?.cost && !metrics?.usage && (
-            <div>
-              No metrics data available
-            </div>
+            <div>
+              <div>
+                No metrics data available
+              </div>
+            </div>
           )}
diff --git a/frontend/src/services/actions.ts b/frontend/src/services/actions.ts index 1fd2c8b878..e9d6b8fd0b 100644 --- a/frontend/src/services/actions.ts +++ b/frontend/src/services/actions.ts @@ -87,13 +87,10 @@ export function handleActionMessage(message: ActionMessage) { } // Update metrics if available - if ( - message.llm_metrics || - message.tool_call_metadata?.model_response?.usage - ) { + if (message.llm_metrics) { const metrics = { cost: message.llm_metrics?.accumulated_cost ?? null, - usage: message.tool_call_metadata?.model_response?.usage ?? null, + usage: message.llm_metrics?.accumulated_token_usage ?? null, }; store.dispatch(setMetrics(metrics)); } diff --git a/frontend/src/state/metrics-slice.ts b/frontend/src/state/metrics-slice.ts index dd5758e04b..52803072d0 100644 --- a/frontend/src/state/metrics-slice.ts +++ b/frontend/src/state/metrics-slice.ts @@ -5,7 +5,8 @@ interface MetricsState { usage: { prompt_tokens: number; completion_tokens: number; - total_tokens: number; + cache_read_tokens: number; + cache_write_tokens: number; } | null; } diff --git a/frontend/src/types/message.tsx b/frontend/src/types/message.tsx index 999840b0fc..763f292da3 100644 --- a/frontend/src/types/message.tsx +++ b/frontend/src/types/message.tsx @@ -19,6 +19,12 @@ export interface ActionMessage { // LLM metrics information llm_metrics?: { accumulated_cost: number; + accumulated_token_usage: { + prompt_tokens: number; + completion_tokens: number; + cache_read_tokens: number; + cache_write_tokens: number; + }; }; // Tool call metadata diff --git a/openhands/core/schema/action.py b/openhands/core/schema/action.py index d330c39e49..ccd1c14ae1 100644 --- a/openhands/core/schema/action.py +++ b/openhands/core/schema/action.py @@ -42,7 +42,6 @@ class ActionType(str, Enum): """Delegates a task to another agent. """ - THINK = 'think' """Logs a thought. 
""" diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py index 587a84a20b..5c314b0f80 100644 --- a/openhands/events/serialization/action.py +++ b/openhands/events/serialization/action.py @@ -1,5 +1,5 @@ -import re from typing import Any + from openhands.core.exceptions import LLMMalformedActionError from openhands.events.action.action import Action from openhands.events.action.agent import ( diff --git a/openhands/events/serialization/event.py b/openhands/events/serialization/event.py index fdc91fbde6..3fcb0393aa 100644 --- a/openhands/events/serialization/event.py +++ b/openhands/events/serialization/event.py @@ -79,6 +79,11 @@ def event_from_dict(data: dict[str, Any]) -> 'Event': metrics.token_usages = [ TokenUsage(**usage) for usage in value.get('token_usages', []) ] + # Set accumulated token usage if available + if 'accumulated_token_usage' in value: + metrics._accumulated_token_usage = TokenUsage( + **value.get('accumulated_token_usage', {}) + ) value = metrics setattr(evt, '_' + key, value) return evt diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py index a5ec0efd75..d178fa1768 100644 --- a/openhands/llm/metrics.py +++ b/openhands/llm/metrics.py @@ -20,12 +20,23 @@ class ResponseLatency(BaseModel): class TokenUsage(BaseModel): """Metric tracking detailed token usage per completion call.""" - model: str - prompt_tokens: int - completion_tokens: int - cache_read_tokens: int - cache_write_tokens: int - response_id: str + model: str = Field(default='') + prompt_tokens: int = Field(default=0) + completion_tokens: int = Field(default=0) + cache_read_tokens: int = Field(default=0) + cache_write_tokens: int = Field(default=0) + response_id: str = Field(default='') + + def __add__(self, other: 'TokenUsage') -> 'TokenUsage': + """Add two TokenUsage instances together.""" + return TokenUsage( + model=self.model, + prompt_tokens=self.prompt_tokens + other.prompt_tokens, + completion_tokens=self.completion_tokens + other.completion_tokens, + cache_read_tokens=self.cache_read_tokens + other.cache_read_tokens, + cache_write_tokens=self.cache_write_tokens + other.cache_write_tokens, + response_id=self.response_id, + ) class Metrics: @@ -42,6 +53,14 @@ class Metrics: self._response_latencies: list[ResponseLatency] = [] self.model_name = model_name self._token_usages: list[TokenUsage] = [] + self._accumulated_token_usage: TokenUsage = TokenUsage( + model=model_name, + prompt_tokens=0, + completion_tokens=0, + cache_read_tokens=0, + cache_write_tokens=0, + response_id='', + ) @property def accumulated_cost(self) -> float: @@ -99,15 +118,24 @@ class Metrics: response_id: str, ) -> None: """Add a single usage record.""" - self._token_usages.append( - TokenUsage( - model=self.model_name, - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - cache_read_tokens=cache_read_tokens, - cache_write_tokens=cache_write_tokens, - response_id=response_id, - ) + usage = TokenUsage( + model=self.model_name, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + cache_read_tokens=cache_read_tokens, + cache_write_tokens=cache_write_tokens, + response_id=response_id, + ) + self._token_usages.append(usage) + + # Update accumulated token usage using the __add__ operator + self._accumulated_token_usage = self._accumulated_token_usage + TokenUsage( + model=self.model_name, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + cache_read_tokens=cache_read_tokens, + cache_write_tokens=cache_write_tokens, + 
diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index a5ec0efd75..d178fa1768 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -20,12 +20,23 @@ class ResponseLatency(BaseModel):
 class TokenUsage(BaseModel):
     """Metric tracking detailed token usage per completion call."""
 
-    model: str
-    prompt_tokens: int
-    completion_tokens: int
-    cache_read_tokens: int
-    cache_write_tokens: int
-    response_id: str
+    model: str = Field(default='')
+    prompt_tokens: int = Field(default=0)
+    completion_tokens: int = Field(default=0)
+    cache_read_tokens: int = Field(default=0)
+    cache_write_tokens: int = Field(default=0)
+    response_id: str = Field(default='')
+
+    def __add__(self, other: 'TokenUsage') -> 'TokenUsage':
+        """Add two TokenUsage instances together."""
+        return TokenUsage(
+            model=self.model,
+            prompt_tokens=self.prompt_tokens + other.prompt_tokens,
+            completion_tokens=self.completion_tokens + other.completion_tokens,
+            cache_read_tokens=self.cache_read_tokens + other.cache_read_tokens,
+            cache_write_tokens=self.cache_write_tokens + other.cache_write_tokens,
+            response_id=self.response_id,
+        )
 
 
 class Metrics:
@@ -42,6 +53,14 @@ class Metrics:
         self._response_latencies: list[ResponseLatency] = []
         self.model_name = model_name
         self._token_usages: list[TokenUsage] = []
+        self._accumulated_token_usage: TokenUsage = TokenUsage(
+            model=model_name,
+            prompt_tokens=0,
+            completion_tokens=0,
+            cache_read_tokens=0,
+            cache_write_tokens=0,
+            response_id='',
+        )
 
     @property
     def accumulated_cost(self) -> float:
@@ -99,15 +118,24 @@ class Metrics:
         response_id: str,
     ) -> None:
         """Add a single usage record."""
-        self._token_usages.append(
-            TokenUsage(
-                model=self.model_name,
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                cache_read_tokens=cache_read_tokens,
-                cache_write_tokens=cache_write_tokens,
-                response_id=response_id,
-            )
+        usage = TokenUsage(
+            model=self.model_name,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            cache_read_tokens=cache_read_tokens,
+            cache_write_tokens=cache_write_tokens,
+            response_id=response_id,
+        )
+        self._token_usages.append(usage)
+
+        # Update accumulated token usage using the __add__ operator
+        self._accumulated_token_usage = self._accumulated_token_usage + TokenUsage(
+            model=self.model_name,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            cache_read_tokens=cache_read_tokens,
+            cache_write_tokens=cache_write_tokens,
+            response_id='',
         )
 
     def merge(self, other: 'Metrics') -> None:
@@ -118,10 +146,16 @@ class Metrics:
         self.token_usages += other.token_usages
         self.response_latencies += other.response_latencies
 
+        # Merge accumulated token usage using the __add__ operator
+        self._accumulated_token_usage = (
+            self._accumulated_token_usage + other._accumulated_token_usage
+        )
+
     def get(self) -> dict:
         """Return the metrics in a dictionary."""
         return {
             'accumulated_cost': self._accumulated_cost,
+            'accumulated_token_usage': self._accumulated_token_usage.model_dump(),
             'costs': [cost.model_dump() for cost in self._costs],
             'response_latencies': [
                 latency.model_dump() for latency in self._response_latencies
@@ -134,6 +168,15 @@ class Metrics:
         self._costs = []
         self._response_latencies = []
         self._token_usages = []
+        # Reset accumulated token usage with a new instance
+        self._accumulated_token_usage = TokenUsage(
+            model=self.model_name,
+            prompt_tokens=0,
+            completion_tokens=0,
+            cache_read_tokens=0,
+            cache_write_tokens=0,
+            response_id='',
+        )
 
     def log(self):
         """Log the metrics."""
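Net effect of the `Metrics` changes: the per-call `token_usages` list is untouched, while `_accumulated_token_usage` keeps a running sum via `TokenUsage.__add__`, and `merge()` folds another instance's running sum in. A short sketch of the resulting behavior (instance names and token counts are illustrative):

```python
from openhands.llm.metrics import Metrics

agent_metrics = Metrics(model_name='gpt-4o')
draft_metrics = Metrics(model_name='gpt-4o')

# Two completions recorded on one Metrics instance.
agent_metrics.add_token_usage(1000, 200, 300, 50, 'resp-1')
agent_metrics.add_token_usage(800, 150, 250, 40, 'resp-2')

# A third completion recorded elsewhere, then merged in.
draft_metrics.add_token_usage(500, 100, 0, 0, 'resp-3')
agent_metrics.merge(draft_metrics)

data = agent_metrics.get()
print(len(data['token_usages']))                             # 3 per-call records
print(data['accumulated_token_usage']['prompt_tokens'])      # 2300
print(data['accumulated_token_usage']['cache_read_tokens'])  # 550
```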
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 57906c2c77..87db3bc62e 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -13,7 +13,7 @@ from openhands.core.config import LLMConfig
 from openhands.core.exceptions import OperationCancelled
 from openhands.core.message import Message, TextContent
 from openhands.llm.llm import LLM
-from openhands.llm.metrics import Metrics
+from openhands.llm.metrics import Metrics, TokenUsage
 
 
 @pytest.fixture(autouse=True)
@@ -45,6 +45,84 @@ def test_llm_init_with_default_config(default_config):
     assert llm.metrics.model_name == 'gpt-4o'
 
 
+def test_token_usage_add():
+    """Test that TokenUsage instances can be added together."""
+    # Create two TokenUsage instances
+    usage1 = TokenUsage(
+        model='model1',
+        prompt_tokens=10,
+        completion_tokens=5,
+        cache_read_tokens=3,
+        cache_write_tokens=2,
+        response_id='response-1',
+    )
+
+    usage2 = TokenUsage(
+        model='model2',
+        prompt_tokens=8,
+        completion_tokens=6,
+        cache_read_tokens=2,
+        cache_write_tokens=4,
+        response_id='response-2',
+    )
+
+    # Add them together
+    combined = usage1 + usage2
+
+    # Verify the result
+    assert combined.model == 'model1'  # Should keep the model from the first instance
+    assert combined.prompt_tokens == 18  # 10 + 8
+    assert combined.completion_tokens == 11  # 5 + 6
+    assert combined.cache_read_tokens == 5  # 3 + 2
+    assert combined.cache_write_tokens == 6  # 2 + 4
+    assert (
+        combined.response_id == 'response-1'
+    )  # Should keep the response_id from the first instance
+
+
+def test_metrics_merge_accumulated_token_usage():
+    """Test that accumulated token usage is properly merged between two Metrics instances."""
+    # Create two Metrics instances
+    metrics1 = Metrics(model_name='model1')
+    metrics2 = Metrics(model_name='model2')
+
+    # Add token usage to each
+    metrics1.add_token_usage(10, 5, 3, 2, 'response-1')
+    metrics2.add_token_usage(8, 6, 2, 4, 'response-2')
+
+    # Verify initial accumulated token usage
+    metrics1_data = metrics1.get()
+    accumulated1 = metrics1_data['accumulated_token_usage']
+    assert accumulated1['prompt_tokens'] == 10
+    assert accumulated1['completion_tokens'] == 5
+    assert accumulated1['cache_read_tokens'] == 3
+    assert accumulated1['cache_write_tokens'] == 2
+
+    metrics2_data = metrics2.get()
+    accumulated2 = metrics2_data['accumulated_token_usage']
+    assert accumulated2['prompt_tokens'] == 8
+    assert accumulated2['completion_tokens'] == 6
+    assert accumulated2['cache_read_tokens'] == 2
+    assert accumulated2['cache_write_tokens'] == 4
+
+    # Merge metrics2 into metrics1
+    metrics1.merge(metrics2)
+
+    # Verify merged accumulated token usage
+    merged_data = metrics1.get()
+    merged_accumulated = merged_data['accumulated_token_usage']
+    assert merged_accumulated['prompt_tokens'] == 18  # 10 + 8
+    assert merged_accumulated['completion_tokens'] == 11  # 5 + 6
+    assert merged_accumulated['cache_read_tokens'] == 5  # 3 + 2
+    assert merged_accumulated['cache_write_tokens'] == 6  # 2 + 4
+
+    # Verify individual token usage records are maintained
+    token_usages = merged_data['token_usages']
+    assert len(token_usages) == 2
+    assert token_usages[0]['response_id'] == 'response-1'
+    assert token_usages[1]['response_id'] == 'response-2'
+
+
 @patch('openhands.llm.llm.litellm.get_model_info')
 def test_llm_init_with_model_info(mock_get_model_info, default_config):
     mock_get_model_info.return_value = {
@@ -140,12 +218,22 @@ def test_llm_reset():
     initial_metrics = copy.deepcopy(llm.metrics)
     initial_metrics.add_cost(1.0)
     initial_metrics.add_response_latency(0.5, 'test-id')
+    initial_metrics.add_token_usage(10, 5, 3, 2, 'test-id')
     llm.reset()
     assert llm.metrics.accumulated_cost != initial_metrics.accumulated_cost
     assert llm.metrics.costs != initial_metrics.costs
     assert llm.metrics.response_latencies != initial_metrics.response_latencies
+    assert llm.metrics.token_usages != initial_metrics.token_usages
     assert isinstance(llm.metrics, Metrics)
+    # Check that accumulated token usage is reset
+    metrics_data = llm.metrics.get()
+    accumulated_usage = metrics_data['accumulated_token_usage']
+    assert accumulated_usage['prompt_tokens'] == 0
+    assert accumulated_usage['completion_tokens'] == 0
+    assert accumulated_usage['cache_read_tokens'] == 0
+    assert accumulated_usage['cache_write_tokens'] == 0
+
 
 @patch('openhands.llm.llm.litellm.get_model_info')
 def test_llm_init_with_openrouter_model(mock_get_model_info, default_config):
@@ -493,6 +581,82 @@ def test_llm_token_usage(mock_litellm_completion, default_config):
     assert usage_entry_2['response_id'] == 'test-response-usage-2'
 
 
+@patch('openhands.llm.llm.litellm_completion')
+def test_accumulated_token_usage(mock_litellm_completion, default_config):
+    """Test that token usage is properly accumulated across multiple LLM calls."""
+    # Mock responses with token usage information
+    mock_response_1 = {
+        'id': 'test-response-1',
+        'choices': [{'message': {'content': 'First response'}}],
+        'usage': {
+            'prompt_tokens': 10,
+            'completion_tokens': 5,
+            'prompt_tokens_details': PromptTokensDetails(cached_tokens=3),
+            'model_extra': {'cache_creation_input_tokens': 4},
+        },
+    }
+
+    mock_response_2 = {
+        'id': 'test-response-2',
+        'choices': [{'message': {'content': 'Second response'}}],
+        'usage': {
+            'prompt_tokens': 8,
+            'completion_tokens': 6,
+            'prompt_tokens_details': PromptTokensDetails(cached_tokens=2),
+            'model_extra': {'cache_creation_input_tokens': 3},
+        },
+    }
+
+    # Set up the mock to return these responses in sequence
+    mock_litellm_completion.side_effect = [mock_response_1, mock_response_2]
+
+    # Create LLM instance
+    llm = LLM(config=default_config)
+
+    # First call
+    llm.completion(messages=[{'role': 'user', 'content': 'First message'}])
+
+    # Check accumulated token usage after first call
+    metrics_data = llm.metrics.get()
+    accumulated_usage = metrics_data['accumulated_token_usage']
+
+    assert accumulated_usage['prompt_tokens'] == 10
+    assert accumulated_usage['completion_tokens'] == 5
+    assert accumulated_usage['cache_read_tokens'] == 3
+    assert accumulated_usage['cache_write_tokens'] == 4
+
+    # Second call
+    llm.completion(messages=[{'role': 'user', 'content': 'Second message'}])
+
+    # Check accumulated token usage after second call
+    metrics_data = llm.metrics.get()
+    accumulated_usage = metrics_data['accumulated_token_usage']
+
+    # Values should be the sum of both calls
+    assert accumulated_usage['prompt_tokens'] == 18  # 10 + 8
+    assert accumulated_usage['completion_tokens'] == 11  # 5 + 6
+    assert accumulated_usage['cache_read_tokens'] == 5  # 3 + 2
+    assert accumulated_usage['cache_write_tokens'] == 7  # 4 + 3
+
+    # Verify individual token usage records are still maintained
+    token_usages = metrics_data['token_usages']
+    assert len(token_usages) == 2
+
+    # First record
+    assert token_usages[0]['prompt_tokens'] == 10
+    assert token_usages[0]['completion_tokens'] == 5
+    assert token_usages[0]['cache_read_tokens'] == 3
+    assert token_usages[0]['cache_write_tokens'] == 4
+    assert token_usages[0]['response_id'] == 'test-response-1'
+
+    # Second record
+    assert token_usages[1]['prompt_tokens'] == 8
+    assert token_usages[1]['completion_tokens'] == 6
+    assert token_usages[1]['cache_read_tokens'] == 2
+    assert token_usages[1]['cache_write_tokens'] == 3
+    assert token_usages[1]['response_id'] == 'test-response-2'
+
+
 @patch('openhands.llm.llm.litellm_completion')
 def test_completion_with_log_completions(mock_litellm_completion, default_config):
     with tempfile.TemporaryDirectory() as temp_dir:
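For context on how the two ends of this diff line up: the `llm_metrics` field that `actions.ts` reads (`accumulated_cost` plus `accumulated_token_usage`) matches two keys of `Metrics.get()`. How the server attaches that payload to an action event is outside this diff; the sketch below is only an illustration of the shape, assuming a `Metrics` instance is at hand:

```python
from openhands.llm.metrics import Metrics

metrics = Metrics(model_name='gpt-4o')
metrics.add_cost(0.0042)
metrics.add_token_usage(1200, 300, 400, 100, 'resp-1')

snapshot = metrics.get()
# Hypothetical payload: only the two keys the frontend reducer stores
# (cost <- accumulated_cost, usage <- accumulated_token_usage with
# prompt/completion/cache_read/cache_write token counts).
llm_metrics_payload = {
    'accumulated_cost': snapshot['accumulated_cost'],
    'accumulated_token_usage': snapshot['accumulated_token_usage'],
}
print(llm_metrics_payload)
```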