feat(eval): Response Latency Tracking (#5588)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Calvin Smith <calvin@all-hands.dev>
Calvin Smith 2024-12-13 14:51:13 -07:00 committed by GitHub
parent e0626a5741
commit 7ef6fa666d
3 changed files with 75 additions and 0 deletions


@@ -199,9 +199,17 @@ class LLM(RetryMixin, DebugMixin):
                }

            try:
                # Record start time for latency measurement
                start_time = time.time()

                # we don't support streaming here, thus we get a ModelResponse
                resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)

                # Calculate and record latency
                latency = time.time() - start_time
                response_id = resp.get('id', 'unknown')
                self.metrics.add_response_latency(latency, response_id)

                non_fncall_response = copy.deepcopy(resp)
                if mock_function_calling:
                    assert len(resp.choices) == 1
@@ -436,6 +444,11 @@ class LLM(RetryMixin, DebugMixin):
                self.metrics.accumulated_cost,
            )

        # Add latency to stats if available
        if self.metrics.response_latencies:
            latest_latency = self.metrics.response_latencies[-1]
            stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency

        usage: Usage | None = response.get('usage')
        if usage:
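
Taken together, the two hunks above time each non-streaming completion call and fold the most recent measurement into the stats line logged after the call. A minimal sketch of reading the recorded values back out, assuming LLM lives in openhands.llm.llm (as the test's patch target suggests) and LLMConfig in openhands.core.config, with placeholder config values:

    from openhands.core.config import LLMConfig
    from openhands.llm.llm import LLM

    # Placeholder model/key; any configured provider behaves the same way.
    llm = LLM(LLMConfig(model='gpt-4o', api_key='test_key'))
    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Every completion call appends one ResponseLatency record.
    for record in llm.metrics.response_latencies:
        print(record.response_id, '%.3f s' % record.latency)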


@@ -9,15 +9,25 @@ class Cost(BaseModel):
    timestamp: float = Field(default_factory=time.time)


class ResponseLatency(BaseModel):
    """Metric tracking the round-trip time per completion call."""

    model: str
    latency: float
    response_id: str


class Metrics:
    """Metrics class can record various metrics during running and evaluation.
    Currently, we define the following metrics:
        accumulated_cost: the total cost (USD $) of the current LLM.
        response_latency: the time taken for each LLM completion call.
    """

    def __init__(self, model_name: str = 'default') -> None:
        self._accumulated_cost: float = 0.0
        self._costs: list[Cost] = []
        self._response_latencies: list[ResponseLatency] = []
        self.model_name = model_name

    @property
@property
@@ -34,26 +44,44 @@ class Metrics:
    def costs(self) -> list[Cost]:
        return self._costs

    @property
    def response_latencies(self) -> list[ResponseLatency]:
        return self._response_latencies

    def add_cost(self, value: float) -> None:
        if value < 0:
            raise ValueError('Added cost cannot be negative.')
        self._accumulated_cost += value
        self._costs.append(Cost(cost=value, model=self.model_name))

    def add_response_latency(self, value: float, response_id: str) -> None:
        if value < 0:
            raise ValueError('Response latency cannot be negative.')
        self._response_latencies.append(
            ResponseLatency(
                latency=value, model=self.model_name, response_id=response_id
            )
        )

    def merge(self, other: 'Metrics') -> None:
        self._accumulated_cost += other.accumulated_cost
        self._costs += other._costs
        self._response_latencies += other._response_latencies

    def get(self) -> dict:
        """Return the metrics in a dictionary."""
        return {
            'accumulated_cost': self._accumulated_cost,
            'costs': [cost.model_dump() for cost in self._costs],
            'response_latencies': [
                latency.model_dump() for latency in self._response_latencies
            ],
        }

    def reset(self):
        self._accumulated_cost = 0.0
        self._costs = []
        self._response_latencies = []

    def log(self):
        """Log the metrics."""


@@ -92,13 +92,47 @@ def test_llm_init_with_metrics():
    )  # because we didn't specify model_name in Metrics init


@patch('openhands.llm.llm.litellm_completion')
@patch('time.time')
def test_response_latency_tracking(mock_time, mock_litellm_completion):
    # Mock time.time() to return controlled values
    mock_time.side_effect = [1000.0, 1002.5]  # Start time, end time (2.5s difference)

    # Mock the completion response with a specific ID
    mock_response = {
        'id': 'test-response-123',
        'choices': [{'message': {'content': 'Test response'}}],
    }
    mock_litellm_completion.return_value = mock_response

    # Create LLM instance and make a completion call
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    llm = LLM(config)
    response = llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Verify the response latency was tracked correctly
    assert len(llm.metrics.response_latencies) == 1
    latency_record = llm.metrics.response_latencies[0]
    assert latency_record.model == 'gpt-4o'
    assert (
        latency_record.latency == 2.5
    )  # Should be the difference between our mocked times
    assert latency_record.response_id == 'test-response-123'

    # Verify the completion response was returned correctly
    assert response['id'] == 'test-response-123'
    assert response['choices'][0]['message']['content'] == 'Test response'


def test_llm_reset():
    llm = LLM(LLMConfig(model='gpt-4o-mini', api_key='test_key'))
    initial_metrics = copy.deepcopy(llm.metrics)
    initial_metrics.add_cost(1.0)
    initial_metrics.add_response_latency(0.5, 'test-id')
    llm.reset()
    assert llm.metrics._accumulated_cost != initial_metrics._accumulated_cost
    assert llm.metrics._costs != initial_metrics._costs
    assert llm.metrics._response_latencies != initial_metrics._response_latencies
    assert isinstance(llm.metrics, Metrics)
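
Because this lands under feat(eval), the per-response records are most useful when aggregated after a run. A hypothetical helper (not part of this commit) that summarizes the output of Metrics.get():

    from statistics import mean


    def summarize_latencies(metrics_dict: dict) -> dict:
        """Summarize response latencies from a Metrics.get() dictionary."""
        latencies = [r['latency'] for r in metrics_dict.get('response_latencies', [])]
        if not latencies:
            return {'count': 0, 'mean': 0.0, 'max': 0.0}
        return {'count': len(latencies), 'mean': mean(latencies), 'max': max(latencies)}


    # e.g. summarize_latencies(llm.metrics.get())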