From 7ef6fa666d61390194340133e154a33b4e1f6b26 Mon Sep 17 00:00:00 2001
From: Calvin Smith
Date: Fri, 13 Dec 2024 14:51:13 -0700
Subject: [PATCH] feat(eval): Response Latency Tracking (#5588)

Co-authored-by: openhands
Co-authored-by: Calvin Smith
---
 openhands/llm/llm.py     | 13 +++++++++++++
 openhands/llm/metrics.py | 28 ++++++++++++++++++++++++++++
 tests/unit/test_llm.py   | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py
index 85010b3fec..6b87b33009 100644
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -199,9 +199,17 @@ class LLM(RetryMixin, DebugMixin):
                    }

            try:
+                # Record start time for latency measurement
+                start_time = time.time()
+
                # we don't support streaming here, thus we get a ModelResponse
                resp: ModelResponse = self._completion_unwrapped(*args, **kwargs)

+                # Calculate and record latency
+                latency = time.time() - start_time
+                response_id = resp.get('id', 'unknown')
+                self.metrics.add_response_latency(latency, response_id)
+
                non_fncall_response = copy.deepcopy(resp)
                if mock_function_calling:
                    assert len(resp.choices) == 1
@@ -436,6 +444,11 @@ class LLM(RetryMixin, DebugMixin):
            self.metrics.accumulated_cost,
        )

+        # Add latency to stats if available
+        if self.metrics.response_latencies:
+            latest_latency = self.metrics.response_latencies[-1]
+            stats += 'Response Latency: %.3f seconds\n' % latest_latency.latency
+
        usage: Usage | None = response.get('usage')

        if usage:
diff --git a/openhands/llm/metrics.py b/openhands/llm/metrics.py
index 182d48d93b..5388dd0402 100644
--- a/openhands/llm/metrics.py
+++ b/openhands/llm/metrics.py
@@ -9,15 +9,25 @@ class Cost(BaseModel):
     timestamp: float = Field(default_factory=time.time)


+class ResponseLatency(BaseModel):
+    """Metric tracking the round-trip time per completion call."""
+
+    model: str
+    latency: float
+    response_id: str
+
+
 class Metrics:
     """Metrics class can record various metrics during running and evaluation.
     Currently, we define the following metrics:
        accumulated_cost: the total cost (USD $) of the current LLM.
+       response_latency: the time taken for each LLM completion call.
     """

     def __init__(self, model_name: str = 'default') -> None:
         self._accumulated_cost: float = 0.0
         self._costs: list[Cost] = []
+        self._response_latencies: list[ResponseLatency] = []
         self.model_name = model_name

     @property
@@ -34,26 +44,44 @@ class Metrics:
     def costs(self) -> list[Cost]:
         return self._costs

+    @property
+    def response_latencies(self) -> list[ResponseLatency]:
+        return self._response_latencies
+
     def add_cost(self, value: float) -> None:
         if value < 0:
             raise ValueError('Added cost cannot be negative.')
         self._accumulated_cost += value
         self._costs.append(Cost(cost=value, model=self.model_name))

+    def add_response_latency(self, value: float, response_id: str) -> None:
+        if value < 0:
+            raise ValueError('Response latency cannot be negative.')
+        self._response_latencies.append(
+            ResponseLatency(
+                latency=value, model=self.model_name, response_id=response_id
+            )
+        )
+
     def merge(self, other: 'Metrics') -> None:
         self._accumulated_cost += other.accumulated_cost
         self._costs += other._costs
+        self._response_latencies += other._response_latencies

     def get(self) -> dict:
         """Return the metrics in a dictionary."""
         return {
             'accumulated_cost': self._accumulated_cost,
             'costs': [cost.model_dump() for cost in self._costs],
+            'response_latencies': [
+                latency.model_dump() for latency in self._response_latencies
+            ],
         }

     def reset(self):
         self._accumulated_cost = 0.0
         self._costs = []
+        self._response_latencies = []

     def log(self):
         """Log the metrics."""
diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py
index 073743ea81..3dc1e1a797 100644
--- a/tests/unit/test_llm.py
+++ b/tests/unit/test_llm.py
@@ -92,13 +92,47 @@ def test_llm_init_with_metrics():
     )  # because we didn't specify model_name in Metrics init


+@patch('openhands.llm.llm.litellm_completion')
+@patch('time.time')
+def test_response_latency_tracking(mock_time, mock_litellm_completion):
+    # Mock time.time() to return controlled values
+    mock_time.side_effect = [1000.0, 1002.5]  # Start time, end time (2.5s difference)
+
+    # Mock the completion response with a specific ID
+    mock_response = {
+        'id': 'test-response-123',
+        'choices': [{'message': {'content': 'Test response'}}],
+    }
+    mock_litellm_completion.return_value = mock_response
+
+    # Create LLM instance and make a completion call
+    config = LLMConfig(model='gpt-4o', api_key='test_key')
+    llm = LLM(config)
+    response = llm.completion(messages=[{'role': 'user', 'content': 'Hello!'}])
+
+    # Verify the response latency was tracked correctly
+    assert len(llm.metrics.response_latencies) == 1
+    latency_record = llm.metrics.response_latencies[0]
+    assert latency_record.model == 'gpt-4o'
+    assert (
+        latency_record.latency == 2.5
+    )  # Should be the difference between our mocked times
+    assert latency_record.response_id == 'test-response-123'
+
+    # Verify the completion response was returned correctly
+    assert response['id'] == 'test-response-123'
+    assert response['choices'][0]['message']['content'] == 'Test response'
+
+
 def test_llm_reset():
     llm = LLM(LLMConfig(model='gpt-4o-mini', api_key='test_key'))
     initial_metrics = copy.deepcopy(llm.metrics)
     initial_metrics.add_cost(1.0)
+    initial_metrics.add_response_latency(0.5, 'test-id')
     llm.reset()
     assert llm.metrics._accumulated_cost != initial_metrics._accumulated_cost
     assert llm.metrics._costs != initial_metrics._costs
+    assert llm.metrics._response_latencies != initial_metrics._response_latencies
     assert isinstance(llm.metrics, Metrics)