feat: add cost-related metrics for better observability (#1944)

* add metrics for total_cost

* make lint

* refactor codeact agent

* move metrics into llm

* add costs list; add metrics into state

* refactor log completion

* refactor and test others

* make lint

* Update opendevin/core/metrics.py

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>

* Update opendevin/llm/llm.py

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

* refactor

* add code

---------

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Yufan Song 2024-05-22 16:53:31 +08:00 committed by GitHub
parent 34cccfe9cc
commit d18e6c85a0
11 changed files with 93 additions and 29 deletions

@@ -42,7 +42,7 @@ class SWEAgent(Agent):
self.running_memory.append(memory)
def _think_act(self, messages: list[dict]) -> tuple[Action, str]:
- resp = self.llm.completion(
+ resp = self.llm.do_completion(
messages=messages,
temperature=0.05,
)

@@ -9,7 +9,6 @@ from agenthub.codeact_agent.prompt import (
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
- from opendevin.core.logger import opendevin_logger as logger
from opendevin.events.action import (
Action,
AgentFinishAction,
@@ -173,7 +172,6 @@ class CodeActAgent(Agent):
Resets the CodeAct Agent.
"""
super().reset()
- self.cost_accumulator = 0
def step(self, state: State) -> Action:
"""
@@ -215,7 +213,7 @@
f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task.'
)
- response = self.llm.completion(
+ response = self.llm.do_completion(
messages=messages,
stop=[
'</execute_ipython>',
@@ -225,8 +223,6 @@
temperature=0.0,
)
- self.log_cost(response)
action_str: str = parse_response(response)
state.num_of_chars += sum(
len(message['content']) for message in messages
@@ -269,15 +265,3 @@
def search_memory(self, query: str) -> list[str]:
raise NotImplementedError('Implement this abstract method')
- def log_cost(self, response):
-     try:
-         cur_cost = self.llm.completion_cost(response)
-     except Exception:
-         cur_cost = 0
-     self.cost_accumulator += cur_cost
-     logger.info(
-         'Cost: %.2f USD | Accumulated Cost: %.2f USD',
-         cur_cost,
-         self.cost_accumulator,
-     )

@@ -65,7 +65,7 @@ class MicroAgent(Agent):
latest_user_message=latest_user_message,
)
messages = [{'content': prompt, 'role': 'user'}]
- resp = self.llm.completion(messages=messages)
+ resp = self.llm.do_completion(messages=messages)
action_resp = resp['choices'][0]['message']['content']
state.num_of_chars += len(prompt) + len(action_resp)
action = parse_response(action_resp)

@@ -242,7 +242,7 @@ class MonologueAgent(Agent):
state.background_commands_obs,
)
messages = [{'content': prompt, 'role': 'user'}]
- resp = self.llm.completion(messages=messages)
+ resp = self.llm.do_completion(messages=messages)
action_resp = resp['choices'][0]['message']['content']
state.num_of_chars += len(prompt) + len(action_resp)
action = prompts.parse_action_response(action_resp)

@@ -43,7 +43,7 @@ class PlannerAgent(Agent):
return AgentFinishAction()
prompt = get_prompt(state)
messages = [{'content': prompt, 'role': 'user'}]
- resp = self.llm.completion(messages=messages)
+ resp = self.llm.do_completion(messages=messages)
action_resp = resp['choices'][0]['message']['content']
state.num_of_chars += len(prompt) + len(action_resp)
action = parse_response(action_resp)

@@ -89,6 +89,8 @@ class AgentController:
def update_state_after_step(self):
self.state.updated_info = []
+ # update metrics especially for cost
+ self.state.metrics = self.agent.llm.metrics
async def report_error(self, message: str, exception: Exception | None = None):
self.state.error = message

@@ -4,6 +4,7 @@ from dataclasses import dataclass, field
from opendevin.controller.state.task import RootTask
from opendevin.core.logger import opendevin_logger as logger
+ from opendevin.core.metrics import Metrics
from opendevin.core.schema import AgentState
from opendevin.events.action import (
Action,
@@ -30,6 +31,7 @@ class State:
outputs: dict = field(default_factory=dict)
error: str | None = None
agent_state: AgentState = AgentState.LOADING
+ metrics: Metrics = Metrics()
def save_to_session(self, sid: str):
fs = get_file_store()
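For orientation, a hedged sketch of what this wiring enables: after every step the controller copies the agent's llm.metrics onto the state (see the AgentController hunk above), so the running cost can be read straight from State. The constructor arguments below are an assumption based on the tests at the end of this commit, not part of the diff.

# Illustrative sketch only (not part of the commit):
from opendevin.controller.state.state import State

state = State(history=[])               # assumed minimal construction, mirroring the tests below
print(state.metrics.accumulated_cost)   # 0.0 until the LLM records a completion cost
print(state.metrics.costs)              # [] until individual completion costs are recorded
# After each step, AgentController.update_state_after_step() runs
# self.state.metrics = self.agent.llm.metrics, keeping this object current.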

opendevin/core/metrics.py (new file, 46 lines)

@@ -0,0 +1,46 @@
+ class Metrics:
+     """
+     Metrics class can record various metrics during running and evaluation.
+     Currently we define the following metrics:
+         accumulated_cost: the total cost (USD $) of the current LLM.
+     """
+
+     def __init__(self) -> None:
+         self._accumulated_cost: float = 0.0
+         self._costs: list[float] = []
+
+     @property
+     def accumulated_cost(self) -> float:
+         return self._accumulated_cost
+
+     @accumulated_cost.setter
+     def accumulated_cost(self, value: float) -> None:
+         if value < 0:
+             raise ValueError('Total cost cannot be negative.')
+         self._accumulated_cost = value
+
+     @property
+     def costs(self) -> list:
+         return self._costs
+
+     def add_cost(self, value: float) -> None:
+         if value < 0:
+             raise ValueError('Added cost cannot be negative.')
+         self._accumulated_cost += value
+         self._costs.append(value)
+
+     def get(self):
+         """
+         Return the metrics in a dictionary.
+         """
+         return {'accumulated_cost': self._accumulated_cost, 'costs': self._costs}
+
+     def log(self):
+         """
+         Log the metrics.
+         """
+         metrics = self.get()
+         logs = ''
+         for key, value in metrics.items():
+             logs += f'{key}: {value}\n'
+         return logs
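A minimal usage sketch of the new Metrics class (illustrative only; the dollar amounts are arbitrary):

from opendevin.core.metrics import Metrics

metrics = Metrics()
metrics.add_cost(0.50)            # record the cost of one completion, in USD
metrics.add_cost(0.25)            # and another

print(metrics.accumulated_cost)   # 0.75
print(metrics.get())              # {'accumulated_cost': 0.75, 'costs': [0.5, 0.25]}
print(metrics.log(), end='')      # one 'key: value' line per metric
metrics.add_cost(-1.0)            # raises ValueError('Added cost cannot be negative.')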

@@ -21,6 +21,7 @@ from tenacity import (
from opendevin.core.config import config
from opendevin.core.logger import llm_prompt_logger, llm_response_logger
from opendevin.core.logger import opendevin_logger as logger
+ from opendevin.core.metrics import Metrics
__all__ = ['LLM']
@@ -58,6 +59,7 @@ class LLM:
max_input_tokens=None,
max_output_tokens=None,
llm_config=None,
+ metrics=None,
):
"""
Initializes the LLM. If LLMConfig is passed, its values will be the fallback.
@@ -77,7 +79,7 @@
custom_llm_provider (str, optional): A custom LLM provider. Defaults to LLM_CUSTOM_LLM_PROVIDER.
llm_timeout (int, optional): The maximum time to wait for a response in seconds. Defaults to LLM_TIMEOUT.
llm_temperature (float, optional): The temperature for LLM sampling. Defaults to LLM_TEMPERATURE.
+ metrics (Metrics, optional): The metrics object to use. Defaults to None.
"""
if llm_config is None:
llm_config = config.llm
@@ -112,6 +114,7 @@
if max_output_tokens is not None
else llm_config.max_output_tokens
)
+ metrics = metrics if metrics is not None else Metrics()
logger.info(f'Initializing LLM with model: {model}')
self.model_name = model
@@ -122,6 +125,7 @@
self.max_output_tokens = max_output_tokens
self.llm_timeout = llm_timeout
self.custom_llm_provider = custom_llm_provider
+ self.metrics = metrics
# litellm actually uses base Exception here for unknown model
self.model_info = None
@@ -200,6 +204,30 @@
"""
return self._completion
+
+ def do_completion(self, *args, **kwargs):
+     """
+     Wrapper for the litellm completion function.
+
+     Check the complete documentation at https://litellm.vercel.app/docs/completion
+     """
+     resp = self._completion(*args, **kwargs)
+     self.post_completion(resp)
+     return resp
+
+ def post_completion(self, response: str) -> None:
+     """
+     Post-process the completion response.
+     """
+     try:
+         cur_cost = self.completion_cost(response)
+     except Exception:
+         cur_cost = 0
+     logger.info(
+         'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+         cur_cost,
+         self.metrics.accumulated_cost,
+     )
def get_token_count(self, messages):
"""
Get the number of tokens in a list of messages.
@@ -231,6 +259,7 @@
def completion_cost(self, response):
"""
Calculate the cost of a completion response based on the model. Local models are treated as free.
+ Add the current cost into total cost in metrics.
Args:
response (list): A response from a model invocation.
@@ -241,6 +270,7 @@
if not self.is_local():
try:
cost = litellm_completion_cost(completion_response=response)
+ self.metrics.add_cost(cost)
return cost
except Exception:
logger.warning('Cost calculation not supported for this model.')
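Pulling the llm.py changes together, here is a hedged, self-contained sketch of the cost-tracking flow. FakeLLM is an illustrative stand-in, not the real LLM class (which also wraps litellm calls, retries, and config); only the path shown in this diff is reproduced, the hard-coded cost is an assumption, and the import assumes the opendevin package from this repo is installed.

from opendevin.core.metrics import Metrics   # the module added in this commit

class FakeLLM:
    """Stand-in mirroring the cost path of the real LLM class (assumption, not the real class)."""

    def __init__(self):
        self.metrics = Metrics()

    def completion_cost(self, response):
        # The real method calls litellm_completion_cost() and treats local or
        # unsupported models as free; a fixed value is used here for illustration.
        cost = 0.25
        self.metrics.add_cost(cost)
        return cost

    def do_completion(self, *args, **kwargs):
        # The real wrapper forwards to the litellm completion, then post-processes.
        response = {'choices': [{'message': {'content': 'ok'}}]}
        self.post_completion(response)
        return response

    def post_completion(self, response):
        try:
            cur_cost = self.completion_cost(response)
        except Exception:
            cur_cost = 0
        print(f'Cost: {cur_cost:.2f} USD | Accumulated Cost: {self.metrics.accumulated_cost:.2f} USD')

llm = FakeLLM()
llm.do_completion(messages=[{'role': 'user', 'content': 'hi'}])
llm.do_completion(messages=[{'role': 'user', 'content': 'hi again'}])
print(llm.metrics.get())   # {'accumulated_cost': 0.5, 'costs': [0.25, 0.25]}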

@@ -16,7 +16,7 @@ class MemoryCondenser:
try:
messages = [{'content': summarize_prompt, 'role': 'user'}]
- resp = llm.completion(messages=messages)
+ resp = llm.do_completion(messages=messages)
summary_response = resp['choices'][0]['message']['content']
return summary_response
except Exception as e:

@@ -31,7 +31,7 @@ def test_coder_agent_with_summary():
"""
mock_llm = MagicMock()
content = json.dumps({'action': 'finish', 'args': {}})
- mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+ mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
assert coder_agent is not None
@@ -43,8 +43,8 @@ def test_coder_agent_with_summary():
state = State(history=history, inputs={'summary': summary})
coder_agent.step(state)
- mock_llm.completion.assert_called_once()
- _, kwargs = mock_llm.completion.call_args
+ mock_llm.do_completion.assert_called_once()
+ _, kwargs = mock_llm.do_completion.call_args
prompt = kwargs['messages'][0]['content']
assert task in prompt
assert "Here's a summary of the codebase, as it relates to this task" in prompt
@@ -58,7 +58,7 @@ def test_coder_agent_without_summary():
"""
mock_llm = MagicMock()
content = json.dumps({'action': 'finish', 'args': {}})
- mock_llm.completion.return_value = {'choices': [{'message': {'content': content}}]}
+ mock_llm.do_completion.return_value = {'choices': [{'message': {'content': content}}]}
coder_agent = Agent.get_cls('CoderAgent')(llm=mock_llm)
assert coder_agent is not None
@@ -69,8 +69,8 @@ def test_coder_agent_without_summary():
state = State(history=history)
coder_agent.step(state)
- mock_llm.completion.assert_called_once()
- _, kwargs = mock_llm.completion.call_args
+ mock_llm.do_completion.assert_called_once()
+ _, kwargs = mock_llm.do_completion.call_args
prompt = kwargs['messages'][0]['content']
assert task in prompt
assert "Here's a summary of the codebase, as it relates to this task" not in prompt