feat(eval): Response Latency Tracking (#5588)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Calvin Smith <calvin@all-hands.dev>
Calvin Smith 2024-12-13 14:51:13 -07:00 committed by GitHub
parent e0626a5741
commit 7ef6fa666d
3 changed files with 75 additions and 0 deletions


@@ -199,9 +199,17 @@ class LLM(RetryMixin, DebugMixin):
                }

            try:
                # Record start time for latency measurement
                start_time = time.time()

                # we don't support streaming here, thus we get a ModelResponse
                resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)

                # Calculate and record latency
                latency = time.time() - start_time
                response_id = resp.get('id', 'unknown')
                self.metrics.add_response_latency(latency, response_id)

                non_fncall_response = copy.deepcopy(resp)
                if mock_function_calling:
                    assert len(resp.choices) == 1
@@ -436,6 +444,11 @@ class LLM(RetryMixin, DebugMixin):
                self.metrics.accumulated_cost,
            )

        # Add latency to stats if available
        if self.metrics.response_latencies:
            latest_latency = self.metrics.response_latencies[-1]
            stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency

        usage: Usage | None = response.get('usage')
        if usage:
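
Taken together, the two hunks above time each non-streaming completion call and fold the most recent measurement into the stats line logged after the call. A minimal sketch of reading the recorded values back out, assuming LLM lives in openhands.llm.llm (as the test's patch target suggests) and LLMConfig in openhands.core.config, with placeholder config values:

    from openhands.core.config import LLMConfig
    from openhands.llm.llm import LLM

    # Placeholder model/key; any configured provider behaves the same way.
    llm = LLM(LLMConfig(model='gpt-4o', api_key='test_key'))
    llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Every completion call appends one ResponseLatency record.
    for record in llm.metrics.response_latencies:
        print(record.response_id, '%.3f s' % record.latency)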


@@ -9,15 +9,25 @@ class Cost(BaseModel):
    timestamp: float = Field(default_factory=time.time)


class ResponseLatency(BaseModel):
    """Metric tracking the round-trip time per completion call."""

    model: str
    latency: float
    response_id: str


class Metrics:
    """Metrics class can record various metrics during running and evaluation.
    Currently, we define the following metrics:
        accumulated_cost: the total cost (USD $) of the current LLM.
        response_latency: the time taken for each LLM completion call.
    """

    def __init__(self, model_name: str = 'default') -> None:
        self._accumulated_cost: float = 0.0
        self._costs: list[Cost] = []
        self._response_latencies: list[ResponseLatency] = []
        self.model_name = model_name

    @property
@property
@@ -34,26 +44,44 @@ class Metrics:
    def costs(self) -> list[Cost]:
        return self._costs

    @property
    def response_latencies(self) -> list[ResponseLatency]:
        return self._response_latencies

    def add_cost(self, value: float) -> None:
        if value < 0:
            raise ValueError('Added cost cannot be negative.')
        self._accumulated_cost += value
        self._costs.append(Cost(cost=value, model=self.model_name))

    def add_response_latency(self, value: float, response_id: str) -> None:
        if value < 0:
            raise ValueError('Response latency cannot be negative.')
        self._response_latencies.append(
            ResponseLatency(
                latency=value, model=self.model_name, response_id=response_id
            )
        )

    def merge(self, other: 'Metrics') -> None:
        self._accumulated_cost += other.accumulated_cost
        self._costs += other._costs
        self._response_latencies += other._response_latencies

    def get(self) -> dict:
        """Return the metrics in a dictionary."""
        return {
            'accumulated_cost': self._accumulated_cost,
            'costs': [cost.model_dump() for cost in self._costs],
            'response_latencies': [
                latency.model_dump() for latency in self._response_latencies
            ],
        }

    def reset(self):
        self._accumulated_cost = 0.0
        self._costs = []
        self._response_latencies = []

    def log(self):
        """Log the metrics."""


@@ -92,13 +92,47 @@ def test_llm_init_with_metrics():
    )  # because we didn't specify model_name in Metrics init


@patch('openhands.llm.llm.litellm_completion')
@patch('time.time')
def test_response_latency_tracking(mock_time, mock_litellm_completion):
    # Mock time.time() to return controlled values
    mock_time.side_effect = [1000.0, 1002.5]  # Start time, end time (2.5s difference)

    # Mock the completion response with a specific ID
    mock_response = {
        'id': 'test-response-123',
        'choices': [{'message': {'content': 'Test response'}}],
    }
    mock_litellm_completion.return_value = mock_response

    # Create LLM instance and make a completion call
    config = LLMConfig(model='gpt-4o', api_key='test_key')
    llm = LLM(config)
    response = llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])

    # Verify the response latency was tracked correctly
    assert len(llm.metrics.response_latencies) == 1
    latency_record = llm.metrics.response_latencies[0]
    assert latency_record.model == 'gpt-4o'
    assert (
        latency_record.latency == 2.5
    )  # Should be the difference between our mocked times
    assert latency_record.response_id == 'test-response-123'

    # Verify the completion response was returned correctly
    assert response['id'] == 'test-response-123'
    assert response['choices'][0]['message']['content'] == 'Test response'


def test_llm_reset():
    llm = LLM(LLMConfig(model='gpt-4o-mini', api_key='test_key'))
    initial_metrics = copy.deepcopy(llm.metrics)
    initial_metrics.add_cost(1.0)
    initial_metrics.add_response_latency(0.5, 'test-id')
    llm.reset()
    assert llm.metrics._accumulated_cost != initial_metrics._accumulated_cost
    assert llm.metrics._costs != initial_metrics._costs
    assert llm.metrics._response_latencies != initial_metrics._response_latencies
    assert isinstance(llm.metrics, Metrics)
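
Because this lands under feat(eval), the per-response records are most useful when aggregated after a run. A hypothetical helper (not part of this commit) that summarizes the output of Metrics.get():

    from statistics import mean


    def summarize_latencies(metrics_dict: dict) -> dict:
        """Summarize response latencies from a Metrics.get() dictionary."""
        latencies = [r['latency'] for r in metrics_dict.get('response_latencies', [])]
        if not latencies:
            return {'count': 0, 'mean': 0.0, 'max': 0.0}
        return {'count': len(latencies), 'mean': mean(latencies), 'max': max(latencies)}


    # e.g. summarize_latencies(llm.metrics.get())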