Mirror of https://github.com/OpenHands/OpenHands.git, synced 2025-12-26 05:48:36 +08:00
feat(eval): Response Latency Tracking (#5588)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Calvin Smith <calvin@all-hands.dev>
Parent: e0626a5741
Commit: 7ef6fa666d
@@ -199,9 +199,17 @@ class LLM(RetryMixin, DebugMixin):
            }

            try:
                # Record start time for latency measurement
                start_time = time.time()

                # we don't support streaming here, thus we get a ModelResponse
                resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)

                # Calculate and record latency
                latency = time.time() - start_time
                response_id = resp.get('id', 'unknown')
                self.metrics.add_response_latency(latency, response_id)

                non_fncall_response = copy.deepcopy(resp)
                if mock_function_calling:
                    assert len(resp.choices) == 1
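For orientation, the measurement here is plain wall-clock timing around the unwrapped completion call, recorded through the Metrics API introduced further down in this diff. A minimal standalone sketch of the same pattern; the fake_completion stub and the import path are illustrative assumptions, not code from this commit:

import time

# Assumption: Metrics is importable from openhands.core.metrics; adjust if the
# module path differs in your checkout.
from openhands.core.metrics import Metrics


def fake_completion(**kwargs):
    # Illustrative stand-in for the real litellm completion call.
    time.sleep(0.05)
    return {'id': 'resp-1', 'choices': [{'message': {'content': 'hi'}}]}


metrics = Metrics(model_name='gpt-4o')

start_time = time.time()
resp = fake_completion(messages=[{'role': 'user', 'content': 'Hello!'}])
latency = time.time() - start_time

# Same bookkeeping the wrapper performs after each completion.
metrics.add_response_latency(latency, resp.get('id', 'unknown'))
print(metrics.response_latencies[-1].latency)  # roughly 0.05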
@@ -436,6 +444,11 @@ class LLM(RetryMixin, DebugMixin):
                self.metrics.accumulated_cost,
            )

        # Add latency to stats if available
        if self.metrics.response_latencies:
            latest_latency = self.metrics.response_latencies[-1]
            stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency

        usage: Usage | None = response.get('usage')

        if usage:
@@ -9,15 +9,25 @@ class Cost(BaseModel):
    timestamp: float = Field(default_factory=time.time)


class ResponseLatency(BaseModel):
    """Metric tracking the round-trip time per completion call."""

    model: str
    latency: float
    response_id: str


class Metrics:
    """Metrics class can record various metrics during running and evaluation.
    Currently, we define the following metrics:
        accumulated_cost: the total cost (USD $) of the current LLM.
        response_latency: the time taken for each LLM completion call.
    """

    def __init__(self, model_name: str = 'default') -> None:
        self._accumulated_cost: float = 0.0
        self._costs: list[Cost] = []
        self._response_latencies: list[ResponseLatency] = []
        self.model_name = model_name

    @property
@@ -34,26 +44,44 @@ class Metrics:
    def costs(self) -> list[Cost]:
        return self._costs

    @property
    def response_latencies(self) -> list[ResponseLatency]:
        return self._response_latencies

    def add_cost(self, value: float) -> None:
        if value < 0:
            raise ValueError('Added cost cannot be negative.')
        self._accumulated_cost += value
        self._costs.append(Cost(cost=value, model=self.model_name))

    def add_response_latency(self, value: float, response_id: str) -> None:
        if value < 0:
            raise ValueError('Response latency cannot be negative.')
        self._response_latencies.append(
            ResponseLatency(
                latency=value, model=self.model_name, response_id=response_id
            )
        )

    def merge(self, other: 'Metrics') -> None:
        self._accumulated_cost += other.accumulated_cost
        self._costs += other._costs
        self._response_latencies += other._response_latencies

    def get(self) -> dict:
        """Return the metrics in a dictionary."""
        return {
            'accumulated_cost': self._accumulated_cost,
            'costs': [cost.model_dump() for cost in self._costs],
            'response_latencies': [
                latency.model_dump() for latency in self._response_latencies
            ],
        }

    def reset(self):
        self._accumulated_cost = 0.0
        self._costs = []
        self._response_latencies = []

    def log(self):
        """Log the metrics."""
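Taken together, the new Metrics surface can be exercised on its own. A small usage sketch based only on the methods shown above; the import path is an assumption:

# Assumption: the module path is openhands.core.metrics; adjust as needed.
from openhands.core.metrics import Metrics

m = Metrics(model_name='gpt-4o')
m.add_cost(0.0021)
m.add_response_latency(1.87, 'resp-abc')

other = Metrics(model_name='gpt-4o')
other.add_response_latency(2.10, 'resp-def')
m.merge(other)

data = m.get()
# data['accumulated_cost'] == 0.0021
# data['response_latencies'] is a list of two dicts, each with
# 'model', 'latency', and 'response_id' keys.

m.reset()  # accumulated cost, per-call costs, and latencies are all cleared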
@@ -92,13 +92,47 @@ def test_llm_init_with_metrics():
    )  # because we didn't specify model_name in Metrics init


@patch('openhands.llm.llm.litellm_completion')
@patch('time.time')
def test_response_latency_tracking(mock_time, mock_litellm_completion):
    # Mock time.time() to return controlled values
    mock_time.side_effect = [1000.0, 1002.5]  # Start time, end time (2.5s difference)

    # Mock the completion response with a specific ID
    mock_response = {
        'id': 'test-response-123',
        'choices': [{'message': {'content': 'Test response'}}],
    }
    mock_litellm_completion.return_value = mock_response

    # Create LLM instance and make a completion call
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    llm = LLM(config)
    response = llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Verify the response latency was tracked correctly
    assert len(llm.metrics.response_latencies) == 1
    latency_record = llm.metrics.response_latencies[0]
    assert latency_record.model == 'gpt-4o'
    assert (
        latency_record.latency == 2.5
    )  # Should be the difference between our mocked times
    assert latency_record.response_id == 'test-response-123'

    # Verify the completion response was returned correctly
    assert response['id'] == 'test-response-123'
    assert response['choices'][0]['message']['content'] == 'Test response'


def test_llm_reset():
    llm = LLM(LLMConfig(model='gpt-4o-mini', api_key='test_key'))
    initial_metrics = copy.deepcopy(llm.metrics)
    initial_metrics.add_cost(1.0)
    initial_metrics.add_response_latency(0.5, 'test-id')
    llm.reset()
    assert llm.metrics._accumulated_cost != initial_metrics._accumulated_cost
    assert llm.metrics._costs != initial_metrics._costs
    assert llm.metrics._response_latencies != initial_metrics._response_latencies
    assert isinstance(llm.metrics, Metrics)
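Since the commit is tagged feat(eval), downstream evaluation code will presumably want aggregates rather than raw per-call records. A hedged sketch of such a helper, which is not part of this diff:

# Hypothetical eval-side helper, not part of this commit: aggregate the
# per-call latencies exposed by Metrics.get().
def summarize_latencies(metrics_dict: dict) -> dict:
    latencies = [r['latency'] for r in metrics_dict.get('response_latencies', [])]
    if not latencies:
        return {'count': 0}
    return {
        'count': len(latencies),
        'mean_s': sum(latencies) / len(latencies),
        'max_s': max(latencies),
    }


# Usage with the Metrics object attached to an LLM instance:
# summary = summarize_latencies(llm.metrics.get())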