From 7ef6fa666d61390194340133e154a33b4e1f6b26 Mon Sep 17 00:00:00 2001
From: Calvin Smith
Date: Fri, 13 Dec 2024 14:51:13 -0700
Subject: [PATCH] feat(eval): Response Latency Tracking (#5588)

Co-authored-by: openhands
Co-authored-by: Calvin Smith
---
 openhands/llm/llm.py     | 13 +++++++++++++
 openhands/llm/metrics.py | 28 ++++++++++++++++++++++++++++
 tests/unit/test_llm.py   | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 85010b3fec..6b87b33009 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -199,9 +199,17 @@ class LLM(RetryMixin, DebugMixin):
                    }

            try:
+                # Record start time for latency measurement
+                start_time = time.time()
+
                # we don't support streaming here, thus we get a ModelResponse
                resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)

+                # Calculate and record latency
+                latency = time.time() - start_time
+                response_id = resp.get('id', 'unknown')
+                self.metrics.add_response_latency(latency, response_id)
+
                non_fncall_response = copy.deepcopy(resp)
                if mock_function_calling:
                    assert len(resp.choices) == 1
@@ -436,6 +444,11 @@ class LLM(RetryMixin, DebugMixin):
            self.metrics.accumulated_cost,
        )

+        # Add latency to stats if available
+        if self.metrics.response_latencies:
+            latest_latency = self.metrics.response_latencies[-1]
+            stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency
+
        usage: Usage | None = response.get('usage')

        if usage:
diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index 182d48d93b..5388dd0402 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -9,15 +9,25 @@ class Cost(BaseModel):
     timestamp: float = Field(default_factory=time.time)


+class ResponseLatency(BaseModel):
+    """Metric tracking the round-trip time per completion call."""
+
+    model: str
+    latency: float
+    response_id: str
+
+
 class Metrics:
     """Metrics class can record various metrics during running and evaluation.
     Currently, we define the following metrics:
        accumulated_cost: the total cost (USD $) of the current LLM.
+       response_latency: the time taken for each LLM completion call.
     """

     def __init__(self, model_name: str = 'default') -> None:
         self._accumulated_cost: float = 0.0
         self._costs: list[Cost] = []
+        self._response_latencies: list[ResponseLatency] = []
         self.model_name = model_name

     @property
@@ -34,26 +44,44 @@ class Metrics:
     def costs(self) -> list[Cost]:
         return self._costs

+    @property
+    def response_latencies(self) -> list[ResponseLatency]:
+        return self._response_latencies
+
     def add_cost(self, value: float) -> None:
         if value < 0:
             raise ValueError('Added cost cannot be negative.')
         self._accumulated_cost += value
         self._costs.append(Cost(cost=value, model=self.model_name))

+    def add_response_latency(self, value: float, response_id: str) -> None:
+        if value < 0:
+            raise ValueError('Response latency cannot be negative.')
+        self._response_latencies.append(
+            ResponseLatency(
+                latency=value, model=self.model_name, response_id=response_id
+            )
+        )
+
     def merge(self, other: 'Metrics') -> None:
         self._accumulated_cost += other.accumulated_cost
         self._costs += other._costs
+        self._response_latencies += other._response_latencies

     def get(self) -> dict:
         """Return the metrics in a dictionary."""
         return {
             'accumulated_cost': self._accumulated_cost,
             'costs': [cost.model_dump() for cost in self._costs],
+            'response_latencies': [
+                latency.model_dump() for latency in self._response_latencies
+            ],
         }

     def reset(self):
         self._accumulated_cost = 0.0
         self._costs = []
+        self._response_latencies = []

     def log(self):
         """Log the metrics."""
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 073743ea81..3dc1e1a797 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -92,13 +92,47 @@ def test_llm_init_with_metrics():
     )  # because we didn't specify model_name in Metrics init


+@patch('openhands.llm.llm.litellm_completion')
+@patch('time.time')
+def test_response_latency_tracking(mock_time, mock_litellm_completion):
+    # Mock time.time() to return controlled values
+    mock_time.side_effect = [1000.0, 1002.5]  # Start time, end time (2.5s difference)
+
+    # Mock the completion response with a specific ID
+    mock_response = {
+        'id': 'test-response-123',
+        'choices': [{'message': {'content': 'Test response'}}],
+    }
+    mock_litellm_completion.return_value = mock_response
+
+    # Create LLM instance and make a completion call
+    config = LLMConfig(model='gpt-4o', api_key='test_key')
+    llm = LLM(config)
+    response = llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])
+
+    # Verify the response latency was tracked correctly
+    assert len(llm.metrics.response_latencies) == 1
+    latency_record = llm.metrics.response_latencies[0]
+    assert latency_record.model == 'gpt-4o'
+    assert (
+        latency_record.latency == 2.5
+    )  # Should be the difference between our mocked times
+    assert latency_record.response_id == 'test-response-123'
+
+    # Verify the completion response was returned correctly
+    assert response['id'] == 'test-response-123'
+    assert response['choices'][0]['message']['content'] == 'Test response'
+
+
 def test_llm_reset():
     llm = LLM(LLMConfig(model='gpt-4o-mini', api_key='test_key'))
     initial_metrics = copy.deepcopy(llm.metrics)
     initial_metrics.add_cost(1.0)
+    initial_metrics.add_response_latency(0.5, 'test-id')
     llm.reset()
     assert llm.metrics._accumulated_cost != initial_metrics._accumulated_cost
     assert llm.metrics._costs != initial_metrics._costs
+    assert llm.metrics._response_latencies != initial_metrics._response_latencies
     assert isinstance(llm.metrics, Metrics)