feat: add cost-related metrics for better observability (#1944)

* add metrics for total_cost

* make lint

* refactor codeact agent

* move metrics into llm

* add costs list; add metrics into state

* refactor log completion

* refactor and test others

* make lint

* Update opendevin/core/metrics.py

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>

* Update opendevin/llm/llm.py

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

* refactor

* add code

---------

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Yufan Song 2024-05-22 16:53:31 +08:00 committed by GitHub
parent 34cccfe9cc
commit d18e6c85a0
11 changed files with 93 additions and 29 deletions

@@ -42,7 +42,7 @@ class SWEAgent(Agent):
self.running_memory.append(memory)
def _think_act(self, messages: list[dict]) -> tuple[Action, str]:
- resp = self.llm.completion(
+ resp = self.llm.do_completion(
messages=messages,
temperature=0.05,
)

@@ -9,7 +9,6 @@ from agenthub.codeact_agent.prompt import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.logger import opendevin_logger as logger
from opendevin.events.action import (
Action,
AgentFinishAction,
@@ -173,7 +172,6 @@ class CodeActAgent(Agent):
Resets the CodeAct Agent.
"""
super().reset()
- self.cost_accumulator = 0
def step(self, state: State) -> Action:
"""
@@ -215,7 +213,7 @@
f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
)
- response = self.llm.completion(
+ response = self.llm.do_completion(
messages=messages,
stop=[
'</execute_ipython>',
@@ -225,8 +223,6 @@
temperature=0.0,
)
- self.log_cost(response)
action_str: str = parse_response(response)
state.num_of_chars += sum(
len(message['content']) for message in messages
@@ -269,15 +265,3 @@
def search_memory(self, query: str) -> list[str]:
raise NotImplementedError('Implement this abstract method')
- def log_cost(self, response):
-     try:
-         cur_cost = self.llm.completion_cost(response)
-     except Exception:
-         cur_cost = 0
-     self.cost_accumulator += cur_cost
-     logger.info(
-         'Cost: %.2f USD | Accumulated Cost: %.2f USD',
-         cur_cost,
-         self.cost_accumulator,
-     )

@@ -65,7 +65,7 @@ class MicroAgent(Agent):
latest_user_message=latest_user_message,
)
messages = [{'content': prompt, 'role': 'user'}]
- resp = self.llm.completion(messages=messages)
+ resp = self.llm.do_completion(messages=messages)
action_resp = resp['choices'][0]['message']['content']
state.num_of_chars += len(prompt) + len(action_resp)
action = parse_response(action_resp)

@@ -242,7 +242,7 @@ class MonologueAgent(Agent):
state.background_commands_obs,
)
messages = [{'content': prompt, 'role': 'user'}]
- resp = self.llm.completion(messages=messages)
+ resp = self.llm.do_completion(messages=messages)
action_resp = resp['choices'][0]['message']['content']
state.num_of_chars += len(prompt) + len(action_resp)
action = prompts.parse_action_response(action_resp)

@@ -43,7 +43,7 @@ class PlannerAgent(Agent):
return AgentFinishAction()
prompt = get_prompt(state)
messages = [{'content': prompt, 'role': 'user'}]
- resp = self.llm.completion(messages=messages)
+ resp = self.llm.do_completion(messages=messages)
action_resp = resp['choices'][0]['message']['content']
state.num_of_chars += len(prompt) + len(action_resp)
action = parse_response(action_resp)

@@ -89,6 +89,8 @@ class AgentController:
def update_state_after_step(self):
self.state.updated_info = []
+ # update metrics especially for cost
+ self.state.metrics = self.agent.llm.metrics
async def report_error(self, message: str, exception: Exception | None = None):
self.state.error = message

@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
from opendevin.controller.state.task import RootTask
from opendevin.core.logger import opendevin_logger as logger
+ from opendevin.core.metrics import Metrics
from opendevin.core.schema import AgentState
from opendevin.events.action import (
Action,
@@ -30,6 +31,7 @@ class State:
outputs: dict = field(default_factory=dict)
error: str | None = None
agent_state: AgentState = AgentState.LOADING
+ metrics: Metrics = Metrics()
def save_to_session(self, sid: str):
fs = get_file_store()
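For orientation, a hedged sketch of what this wiring enables: after every step the controller copies the agent's llm.metrics onto the state (see the AgentController hunk above), so the running cost can be read straight from State. The constructor arguments below are an assumption based on the tests at the end of this commit, not part of the diff.

# Illustrative sketch only (not part of the commit):
from opendevin.controller.state.state import State

state = State(history=[])               # assumed minimal construction, mirroring the tests below
print(state.metrics.accumulated_cost)   # 0.0 until the LLM records a completion cost
print(state.metrics.costs)              # [] until individual completion costs are recorded
# After each step, AgentController.update_state_after_step() runs
# self.state.metrics = self.agent.llm.metrics, keeping this object current.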

opendevin/core/metrics.py (new file, 46 lines)

@@ -0,0 +1,46 @@
+ class Metrics:
+     """
+     Metrics class can record various metrics during running and evaluation.
+     Currently we define the following metrics:
+         accumulated_cost: the total cost (USD $) of the current LLM.
+     """
+
+     def __init__(self) -> None:
+         self._accumulated_cost: float = 0.0
+         self._costs: list[float] = []
+
+     @property
+     def accumulated_cost(self) -> float:
+         return self._accumulated_cost
+
+     @accumulated_cost.setter
+     def accumulated_cost(self, value: float) -> None:
+         if value < 0:
+             raise ValueError('Total cost cannot be negative.')
+         self._accumulated_cost = value
+
+     @property
+     def costs(self) -> list:
+         return self._costs
+
+     def add_cost(self, value: float) -> None:
+         if value < 0:
+             raise ValueError('Added cost cannot be negative.')
+         self._accumulated_cost += value
+         self._costs.append(value)
+
+     def get(self):
+         """
+         Return the metrics in a dictionary.
+         """
+         return {'accumulated_cost': self._accumulated_cost, 'costs': self._costs}
+
+     def log(self):
+         """
+         Log the metrics.
+         """
+         metrics = self.get()
+         logs = ''
+         for key, value in metrics.items():
+             logs += f'{key}: {value}\n'
+         return logs
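A minimal usage sketch of the new Metrics class (illustrative only; the dollar amounts are arbitrary):

from opendevin.core.metrics import Metrics

metrics = Metrics()
metrics.add_cost(0.50)            # record the cost of one completion, in USD
metrics.add_cost(0.25)            # and another

print(metrics.accumulated_cost)   # 0.75
print(metrics.get())              # {'accumulated_cost': 0.75, 'costs': [0.5, 0.25]}
print(metrics.log(), end='')      # one 'key: value' line per metric
metrics.add_cost(-1.0)            # raises ValueError('Added cost cannot be negative.')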

@@ -21,6 +21,7 @@ from tenacity import (
from opendevin.core.config import config
from opendevin.core.logger import llm_prompt_logger, llm_response_logger
from opendevin.core.logger import opendevin_logger as logger
+ from opendevin.core.metrics import Metrics
__all__ = ['LLM']
@@ -58,6 +59,7 @@ class LLM:
max_input_tokens=None,
max_output_tokens=None,
llm_config=None,
+ metrics=None,
):
"""
Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@@ -77,7 +79,7 @@
custom_llm_provider (str, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.
llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
+ metrics (Metrics, optional): The metrics object to use. Defaults to None.
"""
if llm_config is None:
llm_config = config.llm
@@ -112,6 +114,7 @@
if max_output_tokens is not None
else llm_config.max_output_tokens
)
+ metrics = metrics if metrics is not None else Metrics()
logger.info(f'Initializing LLM with model: {model}')
self.model_name = model
@@ -122,6 +125,7 @@
self.max_output_tokens = max_output_tokens
self.llm_timeout = llm_timeout
self.custom_llm_provider = custom_llm_provider
+ self.metrics = metrics
# litellm actually uses base Exception here for unknown model
self.model_info = None
@@ -200,6 +204,30 @@
"""
return self._completion
+
+ def do_completion(self, *args, **kwargs):
+     """
+     Wrapper for the litellm completion function.
+
+     Check the complete documentation at https://litellm.vercel.app/docs/completion
+     """
+     resp = self._completion(*args, **kwargs)
+     self.post_completion(resp)
+     return resp
+
+ def post_completion(self, response: str) -> None:
+     """
+     Post-process the completion response.
+     """
+     try:
+         cur_cost = self.completion_cost(response)
+     except Exception:
+         cur_cost = 0
+     logger.info(
+         'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+         cur_cost,
+         self.metrics.accumulated_cost,
+     )
def get_token_count(self, messages):
"""
Get the number of tokens in a list of messages.
@@ -231,6 +259,7 @@
def completion_cost(self, response):
"""
Calculate the cost of a completion response based on the model. Local models are treated as free.
+ Add the current cost into total cost in metrics.
Args:
response (list): A response from a model invocation.
@@ -241,6 +270,7 @@
if not self.is_local():
try:
cost = litellm_completion_cost(completion_response=response)
+ self.metrics.add_cost(cost)
return cost
except Exception:
logger.warning('Cost calculation not supported for this model.')
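Pulling the llm.py changes together, here is a hedged, self-contained sketch of the cost-tracking flow. FakeLLM is an illustrative stand-in, not the real LLM class (which also wraps litellm calls, retries, and config); only the path shown in this diff is reproduced, the hard-coded cost is an assumption, and the import assumes the opendevin package from this repo is installed.

from opendevin.core.metrics import Metrics   # the module added in this commit

class FakeLLM:
    """Stand-in mirroring the cost path of the real LLM class (assumption, not the real class)."""

    def __init__(self):
        self.metrics = Metrics()

    def completion_cost(self, response):
        # The real method calls litellm_completion_cost() and treats local or
        # unsupported models as free; a fixed value is used here for illustration.
        cost = 0.25
        self.metrics.add_cost(cost)
        return cost

    def do_completion(self, *args, **kwargs):
        # The real wrapper forwards to the litellm completion, then post-processes.
        response = {'choices': [{'message': {'content': 'ok'}}]}
        self.post_completion(response)
        return response

    def post_completion(self, response):
        try:
            cur_cost = self.completion_cost(response)
        except Exception:
            cur_cost = 0
        print(f'Cost: {cur_cost:.2f} USD | Accumulated Cost: {self.metrics.accumulated_cost:.2f} USD')

llm = FakeLLM()
llm.do_completion(messages=[{'role': 'user', 'content': 'hi'}])
llm.do_completion(messages=[{'role': 'user', 'content': 'hi again'}])
print(llm.metrics.get())   # {'accumulated_cost': 0.5, 'costs': [0.25, 0.25]}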

@@ -16,7 +16,7 @@ class MemoryCondenser:
try:
messages = [{'content': summarize_prompt, 'role': 'user'}]
- resp = llm.completion(messages=messages)
+ resp = llm.do_completion(messages=messages)
summary_response = resp['choices'][0]['message']['content']
return summary_response
except Exception as e:

@@ -31,7 +31,7 @@ def test_coder_agent_with_summary():
"""
mock_llm = MagicMock()
content = json.dumps({'action': 'finish', 'args': {}})
- mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+ mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
assert coder_agent is not None
@@ -43,8 +43,8 @@ def test_coder_agent_with_summary():
state = State(history=history, inputs={'summary': summary})
coder_agent.step(state)
- mock_llm.completion.assert_called_once()
- _, kwargs = mock_llm.completion.call_args
+ mock_llm.do_completion.assert_called_once()
+ _, kwargs = mock_llm.do_completion.call_args
prompt = kwargs['messages'][0]['content']
assert task in prompt
assert "Here's a summary of the codebase, as it relates to this task" in prompt
@@ -58,7 +58,7 @@ def test_coder_agent_without_summary():
"""
mock_llm = MagicMock()
content = json.dumps({'action': 'finish', 'args': {}})
- mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+ mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
assert coder_agent is not None
@@ -69,8 +69,8 @@ def test_coder_agent_without_summary():
state = State(history=history)
coder_agent.step(state)
- mock_llm.completion.assert_called_once()
- _, kwargs = mock_llm.completion.call_args
+ mock_llm.do_completion.assert_called_once()
+ _, kwargs = mock_llm.do_completion.call_args
prompt = kwargs['messages'][0]['content']
assert task in prompt
assert "Here's a summary of the codebase, as it relates to this task" not in prompt